vareffect/
fasta.rs

1//! Indexed reference genome reader backed by a flat memory-mapped binary.
2//!
3//! Thin wrapper around [`memmap2::Mmap`] that serves **0-based half-open**
4//! sequences by **UCSC-style chromosome name** on the **plus strand**. All
5//! three conventions match [`crate::types::TranscriptModel`] so downstream
6//! consumers never need to translate between coordinate systems.
7//!
8//! # On-disk format
9//!
10//! The reader expects a pair of files produced by `vareffect-cli setup`:
11//!
12//! - **`GRCh38.bin`** — flat binary genome. Concatenated, newline-stripped,
13//!   uppercased chromosome sequences. One byte per base, standard IUPAC
14//!   nucleotide codes (`A`/`C`/`G`/`T`/`N` plus ambiguity codes `R`/`Y`/`S`/
15//!   `W`/`K`/`M`/`B`/`D`/`H`/`V`). No headers, no line breaks, no padding
16//!   between contigs. ~3.1 GB for GRCh38.p14 (primary + patches).
17//! - **`GRCh38.bin.idx`** — MessagePack-serialized [`GenomeBinIndex`] mapping
18//!   each contig name to its `(offset, length)` in the `.bin` file. ~10 KB.
19//!
20//! Use [`write_genome_binary`] to produce these files from raw contig data
21//! (used by `vareffect-cli setup` and by unit tests).
22//!
23//! # Coordinate convention
24//!
25//! All coordinates exposed by [`FastaReader`] are 0-based half-open
26//! `[start, end)` — identical to `TranscriptModel::tx_start` / `tx_end`.
27//! Converting from a VCF 1-based position is `vcf_pos - 1`.
28//!
29//! # Chromosome name translation
30//!
31//! Callers always use UCSC-style names (`chr1`, `chr17`, `chrX`, `chrY`,
32//! `chrM`, `chr9_KN196479v1_fix`, ...) to stay consistent with
33//! `TranscriptModel::chrom`. The on-disk binary may use any of three naming
34//! conventions (inherited from the source FASTA); the reader detects which
35//! one at open time by scanning the index entries:
36//!
37//! - **NCBI RefSeq** (`NC_000001.11`, `NW_*`, ...) — produced by
38//!   `vareffect-cli setup` from the NCBI GRCh38.p14 assembly. The reader
39//!   translates primary chroms via [`crate::chrom::ucsc_to_refseq`] and
40//!   patch contigs via an optional runtime alias CSV loaded through
41//!   [`FastaReader::open_with_patch_aliases`].
42//! - **UCSC-prefixed** (`chr1`, `chrM`, ...) — pass-through translation.
43//! - **Ensembl bare** (`1`, `MT`, ...) — the reader strips the `chr` prefix
44//!   and maps `chrM -> MT`.
45//!
46//! Patch contigs (`chr9_KN196479v1_fix`, `chr22_KI270879v1_alt`, ...) can
47//! only be served against an NCBI-naming binary when a
48//! `patch_chrom_aliases.csv` is supplied via
49//! [`FastaReader::open_with_patch_aliases`].
50//!
51//! # Thread safety
52//!
53//! [`FastaReader`] is inherently `Send + Sync` — the underlying
54//! [`memmap2::Mmap`] derefs to `&[u8]` with no Mutex required. All threads
55//! can read from the same reader concurrently with zero contention.
56//! [`FastaReader::try_clone`] is retained for API compatibility but simply
57//! clones a handful of `Arc`s — it is no longer needed for parallel
58//! workloads.
59//!
60//! # Soft-masking
61//!
62//! The flat binary stores uppercase IUPAC nucleotide codes. Soft-mask
63//! information (Ensembl lowercase = repeat region) is destroyed at build
64//! time. This matches VEP's internal behavior (which uppercases all fetched
65//! bases) and GA4GH refget v2.0 (specifies uppercase IUPAC).
66//! [`FastaReader::fetch_sequence_raw`] is retained for API compatibility but
67//! returns the same uppercase bytes as [`FastaReader::fetch_sequence`].
68//!
69//! # Usage
70//!
71//! ```no_run
72//! use std::path::Path;
73//! use vareffect::FastaReader;
74//!
75//! // Open the flat binary genome produced by `vareffect-cli setup`.
76//! let reader = FastaReader::open(Path::new("data/vareffect/GRCh38.bin"))?;
77//!
78//! // TP53 c.742C>T lives at chr17:7674221 (1-based VCF) = chr17:7674220 (0-based).
79//! let base = reader.fetch_base("chr17", 7674220)?;
80//! assert_eq!(base, b'C');
81//!
82//! // Patch-contig reads against an NCBI-source binary need the alias CSV.
83//! let reader = FastaReader::open_with_patch_aliases(
84//!     Path::new("data/vareffect/GRCh38.bin"),
85//!     Some(Path::new("data/vareffect/patch_chrom_aliases.csv")),
86//! )?;
87//! # Ok::<(), vareffect::VarEffectError>(())
88//! ```
89
90use std::borrow::Cow;
91use std::collections::HashMap;
92use std::fs::File;
93use std::io::{BufRead, BufReader, BufWriter, Write};
94use std::path::{Path, PathBuf};
95use std::sync::Arc;
96
97use memmap2::Mmap;
98use serde::{Deserialize, Serialize};
99
100use crate::error::VarEffectError;
101
102// ---------------------------------------------------------------------------
103// Index types (serialized as MessagePack in .bin.idx)
104// ---------------------------------------------------------------------------
105
106/// Current format version for [`GenomeBinIndex`]. Increment on breaking
107/// changes to the on-disk layout. Public so `vareffect-cli` uses the same
108/// constant — there must be a single source of truth for reader/writer
109/// version agreement.
110pub const GENOME_BIN_INDEX_VERSION: u32 = 1;
111
112#[cfg(not(target_pointer_width = "64"))]
113compile_error!("vareffect requires a 64-bit target (genome files may exceed 4 GB)");
114
115/// Flat binary genome index.
116///
117/// Serialized as MessagePack and stored alongside the `.bin` file with an
118/// `.idx` suffix. Read once at [`FastaReader::open`] time.
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct GenomeBinIndex {
121    /// Format version. Must equal [`GENOME_BIN_INDEX_VERSION`] at open time.
122    pub version: u32,
123    /// Assembly build label (e.g. `"GRCh38.p14"`). Informational — not
124    /// currently validated at open time, but available for downstream
125    /// logging and safety checks.
126    pub build: String,
127    /// Expected `.bin` file size in bytes. Verified against the mmap length
128    /// at open time to catch truncation.
129    pub expected_size: u64,
130    /// Contigs in file order. Each entry maps a contig name to its byte
131    /// offset and length in the `.bin` file.
132    pub contigs: Vec<ContigEntry>,
133}
134
135/// One contig in the [`GenomeBinIndex`].
136#[derive(Debug, Clone, Serialize, Deserialize)]
137pub struct ContigEntry {
138    /// Raw contig name from the source FASTA (e.g. `"NC_000001.11"`).
139    pub name: String,
140    /// Byte offset in the `.bin` file where this contig's sequence starts.
141    pub offset: u64,
142    /// Sequence length in bytes (= number of bases).
143    pub length: u64,
144}
145
146// ---------------------------------------------------------------------------
147// Builder
148// ---------------------------------------------------------------------------
149
150/// Build a flat binary genome from raw contig sequences.
151///
152/// Writes two files:
153/// - `bin_path`: concatenated uppercase bases, one byte per base, no separators.
154/// - `idx_path`: MessagePack-serialized [`GenomeBinIndex`].
155///
156/// Used by `vareffect-cli setup` for production builds and by unit tests
157/// for synthetic genomes.
158///
159/// # Base validation
160///
161/// Every byte is uppercased and then checked against the standard IUPAC
162/// nucleotide alphabet (`A`, `C`, `G`, `T`, `N`, `R`, `Y`, `S`, `W`, `K`,
163/// `M`, `B`, `D`, `H`, `V`). Non-alphabetic or non-IUPAC bytes are rejected.
164/// The NCBI GRCh38.p14 assembly uses ambiguity codes (e.g. `M`, `R`, `Y`)
165/// in some patch-scaffold regions; these are preserved verbatim.
166///
167/// # Errors
168///
169/// * [`VarEffectError::Io`] on write failures.
170/// * [`VarEffectError::Malformed`] if any byte is not a valid IUPAC
171///   nucleotide code after uppercasing.
172pub fn write_genome_binary(
173    contigs: &[(&str, &[u8])],
174    build: &str,
175    bin_path: &Path,
176    idx_path: &Path,
177) -> Result<(), VarEffectError> {
178    let bin_file = File::create(bin_path).map_err(|source| VarEffectError::Io {
179        path: bin_path.to_path_buf(),
180        source,
181    })?;
182    let mut writer = BufWriter::new(bin_file);
183
184    let mut entries = Vec::with_capacity(contigs.len());
185    let mut offset: u64 = 0;
186
187    // 64 KB staging buffer for chunk-based uppercase + validation + write.
188    // Avoids per-byte function-call overhead on multi-GB genomes.
189    let mut buf = [0u8; 64 * 1024];
190
191    for &(name, seq) in contigs {
192        let entry_offset = offset;
193        for chunk in seq.chunks(buf.len()) {
194            let n = chunk.len();
195            buf[..n].copy_from_slice(chunk);
196            buf[..n].make_ascii_uppercase();
197            for (i, &b) in buf[..n].iter().enumerate() {
198                if !is_iupac_nucleotide(b) {
199                    return Err(VarEffectError::Malformed(format!(
200                        "non-IUPAC byte 0x{:02X} ('{}') in contig {name} at offset {}",
201                        chunk[i],
202                        chunk[i] as char,
203                        offset + i as u64,
204                    )));
205                }
206            }
207            writer
208                .write_all(&buf[..n])
209                .map_err(|source| VarEffectError::Io {
210                    path: bin_path.to_path_buf(),
211                    source,
212                })?;
213            offset += n as u64;
214        }
215        entries.push(ContigEntry {
216            name: name.to_string(),
217            offset: entry_offset,
218            length: seq.len() as u64,
219        });
220    }
221
222    writer.flush().map_err(|source| VarEffectError::Io {
223        path: bin_path.to_path_buf(),
224        source,
225    })?;
226    // Ensure data hits disk before writing the index.
227    writer
228        .into_inner()
229        .map_err(|e| VarEffectError::Io {
230            path: bin_path.to_path_buf(),
231            source: e.into_error(),
232        })?
233        .sync_all()
234        .map_err(|source| VarEffectError::Io {
235            path: bin_path.to_path_buf(),
236            source,
237        })?;
238
239    let index = GenomeBinIndex {
240        version: GENOME_BIN_INDEX_VERSION,
241        build: build.to_string(),
242        expected_size: offset,
243        contigs: entries,
244    };
245    let idx_bytes = rmp_serde::to_vec(&index).map_err(|e| VarEffectError::Io {
246        path: idx_path.to_path_buf(),
247        source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
248    })?;
249    // Atomic write: .tmp + rename so a crash never leaves a partial .bin.idx
250    // that the AlreadyPresent fast-path would accept.
251    let tmp_idx = idx_path.with_extension("tmp");
252    std::fs::write(&tmp_idx, &idx_bytes).map_err(|source| VarEffectError::Io {
253        path: idx_path.to_path_buf(),
254        source,
255    })?;
256    std::fs::rename(&tmp_idx, idx_path).map_err(|source| VarEffectError::Io {
257        path: idx_path.to_path_buf(),
258        source,
259    })?;
260
261    Ok(())
262}
263
264/// Check whether a byte is a valid uppercase IUPAC nucleotide code.
265///
266/// Accepts the 15 standard codes: the four canonical bases (`A`, `C`, `G`,
267/// `T`), the fully-ambiguous placeholder (`N`), and the ten two-/three-base
268/// ambiguity codes (`R`, `Y`, `S`, `W`, `K`, `M`, `B`, `D`, `H`, `V`).
269/// The NCBI GRCh38.p14 assembly uses several of these in patch-scaffold
270/// regions; rejecting them would prevent building a complete genome binary.
271#[inline]
272pub fn is_iupac_nucleotide(b: u8) -> bool {
273    matches!(
274        b,
275        b'A' | b'C'
276            | b'G'
277            | b'T'
278            | b'N'
279            | b'R'
280            | b'Y'
281            | b'S'
282            | b'W'
283            | b'K'
284            | b'M'
285            | b'B'
286            | b'D'
287            | b'H'
288            | b'V'
289    )
290}
291
292// ---------------------------------------------------------------------------
293// Contig naming detection
294// ---------------------------------------------------------------------------
295
296/// On-disk contig naming style. Detected by scanning the [`GenomeBinIndex`]
297/// contig names at [`FastaReader::open`] time; drives the translation ladder
298/// inside [`FastaReader::translate_chrom`].
299///
300/// Detection priority: `NC_*` -> `NcbiRefSeq` beats a later `chr*` ->
301/// `UcscPrefixed` so a mixed-naming source (e.g. `NC_*` primaries + `NW_*`
302/// patches) still resolves toward the translation-heavy branch.
303#[derive(Debug, Clone, Copy, PartialEq, Eq)]
304pub(crate) enum ContigNaming {
305    /// Contigs are NCBI RefSeq accessions (`NC_000001.11`, `NW_*`, `NT_*`).
306    NcbiRefSeq,
307    /// Contigs are UCSC-style (`chr1`, `chrM`, `chr9_KN196479v1_fix`).
308    UcscPrefixed,
309    /// Contigs are bare Ensembl names (`1`, `MT`, `X`, `Y`).
310    EnsemblBare,
311}
312
313// ---------------------------------------------------------------------------
314// FastaReader
315// ---------------------------------------------------------------------------
316
317/// Memory-mapped reference genome reader for random-access sequence retrieval.
318///
319/// Construct with [`FastaReader::open`] (or
320/// [`FastaReader::open_with_patch_aliases`] when patch-contig lookups against
321/// an NCBI-source binary are needed), then call
322/// [`FastaReader::fetch_sequence`] to pull bases by `(chrom, start, end)`.
323/// Coordinates are 0-based half-open; chromosome names are UCSC-style.
324///
325/// See the module docs for details on the on-disk format, coordinate
326/// conventions, chromosome name translation, and thread safety.
327pub struct FastaReader {
328    /// Memory-mapped genome binary. `Mmap` derefs to `&[u8]` and is
329    /// `Send + Sync`, so no Mutex is needed for concurrent reads.
330    mmap: Arc<Mmap>,
331
332    /// Per-contig `(byte_offset, length)` keyed by the **raw index name**
333    /// (the exact name stored in the `.bin.idx`). The translation ladder
334    /// in [`FastaReader::translate_chrom`] converts caller-side UCSC names
335    /// *into* raw index keys before probing this map.
336    contigs: Arc<HashMap<String, (u64, u64)>>,
337
338    /// Detected on-disk naming convention. See [`ContigNaming`].
339    naming: ContigNaming,
340
341    /// Optional UCSC -> RefSeq patch alias table. Only populated when the
342    /// caller supplies a `patch_chrom_aliases.csv` to
343    /// [`FastaReader::open_with_patch_aliases`] *and* the binary uses
344    /// [`ContigNaming::NcbiRefSeq`].
345    patch_aliases: Option<Arc<HashMap<String, String>>>,
346
347    /// Original `.bin` path, kept for diagnostic messages.
348    path: PathBuf,
349}
350
351impl FastaReader {
352    /// Open a flat binary genome file without a patch-contig alias table.
353    ///
354    /// This is the common-case constructor for callers that only need
355    /// primary-chromosome reads (`chr1`..`chrM`). If the binary uses NCBI
356    /// RefSeq naming and you need patch-contig reads as well, use
357    /// [`FastaReader::open_with_patch_aliases`] instead.
358    ///
359    /// The expected on-disk layout is:
360    /// - `path` — the `.bin` file (flat binary genome)
361    /// - `{path}.idx` — the MessagePack index (sibling file)
362    ///
363    /// # Errors
364    ///
365    /// * [`VarEffectError::IndexNotFound`] if the `.idx` sidecar is missing.
366    /// * [`VarEffectError::Io`] on I/O or deserialization errors.
367    /// * [`VarEffectError::Malformed`] if the index version is unsupported
368    ///   or the `.bin` file size doesn't match the expected size.
369    pub fn open(path: &Path) -> Result<Self, VarEffectError> {
370        Self::open_with_patch_aliases(path, None)
371    }
372
373    /// Open a flat binary genome file, optionally with a patch-contig alias
374    /// table for NCBI-source binaries.
375    ///
376    /// `patch_aliases_csv` is a path to a `refseq,ucsc` CSV produced by
377    /// `vareffect-cli setup`. When supplied *and* the binary uses
378    /// NCBI RefSeq contig naming, the reader loads the CSV into a
379    /// UCSC -> RefSeq map for the second tier of the translation ladder.
380    /// For any other combination, the argument is silently ignored.
381    ///
382    /// See [`FastaReader::open`] for the expected on-disk layout and error
383    /// conditions.
384    pub fn open_with_patch_aliases(
385        path: &Path,
386        patch_aliases_csv: Option<&Path>,
387    ) -> Result<Self, VarEffectError> {
388        // 1. Locate and parse the .bin.idx sidecar.
389        let idx_path = append_idx_extension(path);
390        if !idx_path.exists() {
391            return Err(VarEffectError::IndexNotFound {
392                path: idx_path.display().to_string(),
393            });
394        }
395        let idx_bytes = std::fs::read(&idx_path).map_err(|source| VarEffectError::Io {
396            path: idx_path.clone(),
397            source,
398        })?;
399        let index: GenomeBinIndex =
400            rmp_serde::from_slice(&idx_bytes).map_err(|e| VarEffectError::Io {
401                path: idx_path.clone(),
402                source: std::io::Error::new(std::io::ErrorKind::InvalidData, e),
403            })?;
404
405        // 2. Validate index version.
406        if index.version != GENOME_BIN_INDEX_VERSION {
407            return Err(VarEffectError::Malformed(format!(
408                "unsupported genome binary index version {} (expected {})",
409                index.version, GENOME_BIN_INDEX_VERSION,
410            )));
411        }
412
413        // 3. Memory-map the .bin file.
414        let file = File::open(path).map_err(|source| VarEffectError::Io {
415            path: path.to_path_buf(),
416            source,
417        })?;
418        // SAFETY: The .bin file is opened read-only and is write-once
419        // (produced by `vareffect-cli setup`, never modified in place).
420        // Hot-reload uses ArcSwap to atomically swap entire FastaReader
421        // instances rather than modifying the underlying file.
422        let mmap = unsafe {
423            memmap2::MmapOptions::new()
424                .map(&file)
425                .map_err(|source| VarEffectError::Io {
426                    path: path.to_path_buf(),
427                    source,
428                })?
429        };
430
431        // 4. Truncation guard: verify the mmap length matches the expected
432        //    size stored in the index.
433        if mmap.len() as u64 != index.expected_size {
434            return Err(VarEffectError::Malformed(format!(
435                "genome binary size mismatch: expected {} bytes, got {} — \
436                 file may be truncated or corrupt",
437                index.expected_size,
438                mmap.len(),
439            )));
440        }
441
442        // 5. Build the contigs lookup and detect naming convention.
443        let mut contigs: HashMap<String, (u64, u64)> = HashMap::with_capacity(index.contigs.len());
444        let mut naming = ContigNaming::EnsemblBare;
445
446        for entry in &index.contigs {
447            // Detection: NC_* wins outright; chr* flips to UcscPrefixed
448            // only if we haven't already seen an NC_* record.
449            if entry.name.starts_with("NC_") {
450                naming = ContigNaming::NcbiRefSeq;
451            } else if entry.name.starts_with("chr") && naming != ContigNaming::NcbiRefSeq {
452                naming = ContigNaming::UcscPrefixed;
453            }
454
455            contigs.insert(entry.name.clone(), (entry.offset, entry.length));
456        }
457
458        // 6. Load the patch alias CSV only when it was provided *and* the
459        //    binary actually uses NCBI RefSeq naming.
460        let patch_aliases = match (patch_aliases_csv, naming) {
461            (Some(csv_path), ContigNaming::NcbiRefSeq) => {
462                Some(Arc::new(load_ucsc_to_refseq_aliases(csv_path)?))
463            }
464            _ => None,
465        };
466
467        Ok(Self {
468            mmap: Arc::new(mmap),
469            contigs: Arc::new(contigs),
470            naming,
471            patch_aliases,
472            path: path.to_path_buf(),
473        })
474    }
475
476    /// Mint an additional reader handle for the same genome binary.
477    ///
478    /// With the memory-mapped backend this is trivially cheap — it clones
479    /// a handful of `Arc`s. The `Result` wrapper is retained for API
480    /// compatibility with callers written for the previous seek-based
481    /// reader; it always returns `Ok`.
482    ///
483    /// **Note:** With the mmap backend, `try_clone` is no longer needed for
484    /// parallel workloads. All threads can read from the same `FastaReader`
485    /// concurrently with zero contention.
486    pub fn try_clone(&self) -> Result<Self, VarEffectError> {
487        Ok(Self {
488            mmap: Arc::clone(&self.mmap),
489            contigs: Arc::clone(&self.contigs),
490            naming: self.naming,
491            patch_aliases: self.patch_aliases.as_ref().map(Arc::clone),
492            path: self.path.clone(),
493        })
494    }
495
496    /// Fetch a genomic sequence as uppercase ASCII bytes.
497    ///
498    /// Coordinates are 0-based half-open `[start, end)`. The returned
499    /// sequence is the plus-strand reference — strand-aware complementing
500    /// is the caller's responsibility (see `crate::types::Strand`).
501    ///
502    /// The flat binary stores uppercase bases only, so the returned bytes
503    /// are always uppercase IUPAC nucleotide codes (typically `A`/`C`/`G`/
504    /// `T`/`N`, occasionally ambiguity codes like `M`/`R`/`Y` in some
505    /// GRCh38 patch regions).
506    ///
507    /// # Errors
508    ///
509    /// * [`VarEffectError::ChromNotFound`] if `chrom` is not present in the
510    ///   genome index.
511    /// * [`VarEffectError::CoordinateOutOfRange`] if `start >= end` or `end`
512    ///   exceeds the chromosome length.
513    pub fn fetch_sequence(
514        &self,
515        chrom: &str,
516        start: u64,
517        end: u64,
518    ) -> Result<Vec<u8>, VarEffectError> {
519        let translated = self.translate_chrom(chrom);
520
521        let &(offset, length) =
522            self.contigs
523                .get(translated.as_ref())
524                .ok_or_else(|| VarEffectError::ChromNotFound {
525                    chrom: chrom.to_string(),
526                })?;
527
528        if start >= end || end > length {
529            return Err(VarEffectError::CoordinateOutOfRange {
530                chrom: chrom.to_string(),
531                start,
532                end,
533                chrom_len: length,
534            });
535        }
536
537        // Bounds validated — mmap indexing cannot panic.
538        let slice_start = (offset + start) as usize;
539        let slice_end = (offset + end) as usize;
540        Ok(self.mmap[slice_start..slice_end].to_vec())
541    }
542
543    /// Fetch a genomic sequence as raw ASCII bytes.
544    ///
545    /// With the flat binary backend, this is functionally identical to
546    /// [`FastaReader::fetch_sequence`] — the binary stores uppercase bases
547    /// only, so soft-mask (lowercase) information is not preserved.
548    ///
549    /// Retained for API compatibility. Callers that need repeat-region
550    /// annotations should use a dedicated RepeatMasker store rather than
551    /// relying on FASTA soft-masking.
552    pub fn fetch_sequence_raw(
553        &self,
554        chrom: &str,
555        start: u64,
556        end: u64,
557    ) -> Result<Vec<u8>, VarEffectError> {
558        self.fetch_sequence(chrom, start, end)
559    }
560
561    /// Fetch a single base at the given 0-based position.
562    ///
563    /// Returns the uppercase ASCII byte. With the mmap backend, this is a
564    /// single pointer dereference (~5 ns).
565    ///
566    /// # Errors
567    ///
568    /// Same as [`FastaReader::fetch_sequence`].
569    pub fn fetch_base(&self, chrom: &str, pos: u64) -> Result<u8, VarEffectError> {
570        let translated = self.translate_chrom(chrom);
571        let &(offset, length) =
572            self.contigs
573                .get(translated.as_ref())
574                .ok_or_else(|| VarEffectError::ChromNotFound {
575                    chrom: chrom.to_string(),
576                })?;
577
578        if pos >= length {
579            return Err(VarEffectError::CoordinateOutOfRange {
580                chrom: chrom.to_string(),
581                start: pos,
582                end: pos + 1,
583                chrom_len: length,
584            });
585        }
586
587        Ok(self.mmap[(offset + pos) as usize])
588    }
589
590    /// Verify that the reference allele at a genomic position matches the
591    /// genome binary. Returns `true` on match, `false` on mismatch.
592    ///
593    /// Both the stored bases and `ref_allele` are compared case-insensitively.
594    /// I/O or coordinate errors propagate as `Err` — only a clean byte-level
595    /// mismatch returns `Ok(false)`. This mirrors VCF `--check-ref` semantics.
596    ///
597    /// With the mmap backend, this is a zero-copy slice comparison — no
598    /// `Vec<u8>` allocation.
599    pub fn verify_ref(
600        &self,
601        chrom: &str,
602        pos: u64,
603        ref_allele: &[u8],
604    ) -> Result<bool, VarEffectError> {
605        if ref_allele.is_empty() {
606            return Ok(true);
607        }
608
609        let translated = self.translate_chrom(chrom);
610        let &(offset, length) =
611            self.contigs
612                .get(translated.as_ref())
613                .ok_or_else(|| VarEffectError::ChromNotFound {
614                    chrom: chrom.to_string(),
615                })?;
616
617        let end_pos = pos + ref_allele.len() as u64;
618        if end_pos > length {
619            return Err(VarEffectError::CoordinateOutOfRange {
620                chrom: chrom.to_string(),
621                start: pos,
622                end: end_pos,
623                chrom_len: length,
624            });
625        }
626
627        // Bounds validated — mmap indexing cannot panic.
628        let start = (offset + pos) as usize;
629        let end = start + ref_allele.len();
630        let slice = &self.mmap[start..end];
631        Ok(slice
632            .iter()
633            .zip(ref_allele.iter())
634            .all(|(a, b)| a.eq_ignore_ascii_case(b)))
635    }
636
637    /// Return the length of `chrom` in bases, or `None` if the chromosome is
638    /// not present in the genome index.
639    ///
640    /// O(1) — reads from the contig table built at [`FastaReader::open`] time.
641    pub fn chrom_length(&self, chrom: &str) -> Option<u64> {
642        let translated = self.translate_chrom(chrom);
643        self.contigs
644            .get(translated.as_ref())
645            .map(|&(_, length)| length)
646    }
647
648    // ---------------------------------------------------------------
649    // Internal helpers
650    // ---------------------------------------------------------------
651
652    /// Translate a caller-supplied UCSC chromosome name into the raw index
653    /// name. The three-branch dispatcher covers every supported on-disk
654    /// naming; see [`ContigNaming`] for the detection logic.
655    ///
656    /// Returns a `Cow` so the common-case branches (NCBI primary chrom,
657    /// UCSC pass-through, Ensembl strip-`chr`) allocate nothing. The
658    /// NcbiRefSeq patch-alias tier is the only path that allocates: the
659    /// alias map stores owned `String` values, and returning a borrow of
660    /// that string would require the returned `Cow`'s lifetime to extend
661    /// beyond the caller-supplied `chrom`, which the function signature
662    /// doesn't allow.
663    fn translate_chrom<'a>(&'a self, chrom: &'a str) -> Cow<'a, str> {
664        match self.naming {
665            ContigNaming::NcbiRefSeq => {
666                // Tier 1: primary chroms via the 25-entry const table.
667                // `ucsc_to_refseq` returns `&'static str` on a hit, so the
668                // `Cow::Borrowed` here has a 'static lifetime — always safe
669                // to coerce into 'a.
670                let primary = crate::chrom::ucsc_to_refseq(chrom);
671                if !std::ptr::eq(primary.as_ptr(), chrom.as_ptr()) {
672                    // `primary` points into `NC_TO_UCSC`, not into `chrom`
673                    // — the pointer compare distinguishes "table hit" from
674                    // "pass-through" without a second string compare.
675                    return Cow::Borrowed(primary);
676                }
677                // Tier 2: patch contigs via the runtime alias map, if one
678                // was supplied at open time.
679                if let Some(aliases) = &self.patch_aliases
680                    && let Some(refseq) = aliases.get(chrom)
681                {
682                    return Cow::Owned(refseq.clone());
683                }
684                // Tier 3: unknown name — pass through. The caller will get
685                // a `ChromNotFound` from the contigs lookup, which is the
686                // desired failure mode for typos or unknown patches.
687                Cow::Borrowed(chrom)
688            }
689            ContigNaming::UcscPrefixed => {
690                // Binary uses UCSC names — no translation needed.
691                Cow::Borrowed(chrom)
692            }
693            ContigNaming::EnsemblBare => {
694                if chrom == "chrM" {
695                    // Mitochondrial contig is special: Ensembl calls it `MT`.
696                    Cow::Owned("MT".to_string())
697                } else if let Some(stripped) = chrom.strip_prefix("chr") {
698                    Cow::Borrowed(stripped)
699                } else {
700                    // Caller passed a name that doesn't start with `chr`;
701                    // hand it through unchanged so tests and ad-hoc
702                    // callers can still query by the raw index name.
703                    Cow::Borrowed(chrom)
704                }
705            }
706        }
707    }
708}
709
710/// Load `patch_chrom_aliases.csv` into a UCSC -> RefSeq lookup map.
711///
712/// The on-disk CSV format is `refseq,ucsc` (see
713/// `vareffect-cli::builders::patch_chrom_aliases`), so this loader inverts
714/// the direction at parse time. The same tolerances as the builder-side
715/// loader apply: `#` comments, blank lines, the header row, and leading/
716/// trailing whitespace are skipped.
717///
718/// We deliberately duplicate ~25 LOC of CSV parsing here rather than
719/// depending on `vareffect-cli`: the `vareffect` crate is designed to be
720/// publishable standalone on crates.io, and a dependency on the workspace
721/// CLI would defeat that.
722fn load_ucsc_to_refseq_aliases(path: &Path) -> Result<HashMap<String, String>, VarEffectError> {
723    let file = File::open(path).map_err(|source| VarEffectError::Io {
724        path: path.to_path_buf(),
725        source,
726    })?;
727    let reader = BufReader::new(file);
728
729    let mut map: HashMap<String, String> = HashMap::new();
730    for line in reader.lines() {
731        let line = line.map_err(|source| VarEffectError::Io {
732            path: path.to_path_buf(),
733            source,
734        })?;
735        let trimmed = line.trim();
736        if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("refseq,") {
737            continue;
738        }
739        let Some((refseq, ucsc)) = trimmed.split_once(',') else {
740            continue;
741        };
742        let refseq = refseq.trim();
743        let ucsc = ucsc.trim();
744        if refseq.is_empty() || ucsc.is_empty() {
745            continue;
746        }
747        // Inverted direction: key on UCSC, value is RefSeq.
748        map.insert(ucsc.to_string(), refseq.to_string());
749    }
750    Ok(map)
751}
752
753impl std::fmt::Debug for FastaReader {
754    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
755        f.debug_struct("FastaReader")
756            .field("path", &self.path)
757            .field("naming", &self.naming)
758            .field("patch_aliases", &self.patch_aliases.is_some())
759            .field("contigs", &self.contigs.len())
760            .finish()
761    }
762}
763
764/// Derive the `.bin.idx` sidecar path from the `.bin` path by appending
765/// `.idx` to the full filename.
766fn append_idx_extension(path: &Path) -> PathBuf {
767    let mut os = path.as_os_str().to_os_string();
768    os.push(".idx");
769    PathBuf::from(os)
770}
771
772// ---------------------------------------------------------------------------
773// Tests
774// ---------------------------------------------------------------------------
775
776#[cfg(test)]
777mod tests {
778    use super::*;
779    use tempfile::TempDir;
780
781    /// Build a flat binary genome from contigs and open it as a `FastaReader`.
782    fn write_test_genome(contigs: &[(&str, &[u8])]) -> (TempDir, FastaReader) {
783        let tmp = TempDir::new().unwrap();
784        let bin_path = tmp.path().join("test.bin");
785        let idx_path = tmp.path().join("test.bin.idx");
786        write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
787        let reader = FastaReader::open(&bin_path).unwrap();
788        (tmp, reader)
789    }
790
791    /// Write a `refseq,ucsc` alias CSV to a tempdir and return the path.
792    fn write_patch_alias_csv(tmp: &TempDir) -> PathBuf {
793        let csv_path = tmp.path().join("patch_chrom_aliases.csv");
794        let contents = "\
795# GRCh38 patch-contig aliases: RefSeq accession -> UCSC contig name.
796refseq,ucsc
797NW_009646194.1,chr1_KN196472v1_fix
798NT_187633.1,chr22_KI270879v1_alt
799";
800        std::fs::write(&csv_path, contents).unwrap();
801        csv_path
802    }
803
804    // -- Naming detection --------------------------------------------------
805
806    #[test]
807    fn chrom_name_translation_ensembl_style() {
808        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
809        let (_tmp, reader) = write_test_genome(contigs);
810
811        assert_eq!(reader.naming, ContigNaming::EnsemblBare);
812
813        // chr1 -> 1
814        let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
815        assert_eq!(seq, b"ACGT");
816
817        // chrM -> MT, start at position 4 (the first `A`)
818        let seq = reader.fetch_sequence("chrM", 4, 8).unwrap();
819        assert_eq!(seq, b"ACGT");
820    }
821
822    #[test]
823    fn chrom_name_translation_ucsc_style() {
824        let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT")];
825        let (_tmp, reader) = write_test_genome(contigs);
826
827        assert_eq!(reader.naming, ContigNaming::UcscPrefixed);
828
829        // UCSC name passes through unchanged.
830        let seq = reader.fetch_sequence("chr1", 0, 4).unwrap();
831        assert_eq!(seq, b"ACGT");
832    }
833
834    #[test]
835    fn ucsc_naming_detected_even_without_chr1() {
836        // A partial assembly without chr1 should still detect UCSC style.
837        let contigs: &[(&str, &[u8])] = &[("chr22", b"ACGT"), ("chrY", b"TTTT")];
838        let (_tmp, reader) = write_test_genome(contigs);
839
840        assert_eq!(
841            reader.naming,
842            ContigNaming::UcscPrefixed,
843            "partial assembly should still be flagged UCSC-style",
844        );
845        assert_eq!(reader.fetch_base("chr22", 0).unwrap(), b'A');
846        assert_eq!(reader.fetch_base("chrY", 0).unwrap(), b'T');
847    }
848
849    #[test]
850    fn ncbi_naming_detected_with_nc_prefix() {
851        let contigs: &[(&str, &[u8])] = &[
852            ("NC_000001.11", b"ACGTACGTACGTACGT"),
853            ("NW_009646194.1", b"GGGGCCCC"),
854        ];
855        let (_tmp, reader) = write_test_genome(contigs);
856
857        assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
858        // Caller supplies UCSC form — reader translates to NC_000001.11.
859        assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
860        assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
861        assert_eq!(reader.chrom_length("chr1"), Some(16));
862    }
863
864    #[test]
865    fn mixed_ncbi_naming_nc_plus_nw() {
866        // Both NC_ and NW_ contigs — NW_ doesn't match NC_ or chr checks,
867        // naming should stay NcbiRefSeq.
868        let contigs: &[(&str, &[u8])] = &[
869            ("NC_000001.11", b"AAAA"),
870            ("NW_009646194.1", b"CCCC"),
871            ("NT_187633.1", b"GGGG"),
872        ];
873        let (_tmp, reader) = write_test_genome(contigs);
874        assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
875    }
876
877    // -- Coordinate validation ---------------------------------------------
878
879    #[test]
880    fn coordinate_validation_start_ge_end() {
881        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
882        let (_tmp, reader) = write_test_genome(contigs);
883
884        let err = reader.fetch_sequence("chr1", 5, 5).unwrap_err();
885        assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
886
887        let err = reader.fetch_sequence("chr1", 10, 5).unwrap_err();
888        assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
889    }
890
891    #[test]
892    fn coordinate_validation_out_of_range() {
893        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
894        let (_tmp, reader) = write_test_genome(contigs);
895
896        // Contig is 16 bases, so end > 16 is out of range.
897        let err = reader.fetch_sequence("chr1", 0, 17).unwrap_err();
898        assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
899    }
900
901    #[test]
902    fn fetch_base_rejects_position_past_end_of_chrom() {
903        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
904        let (_tmp, reader) = write_test_genome(contigs);
905        // 16 bases — valid positions are 0..16.
906        let err = reader.fetch_base("chr1", 16).unwrap_err();
907        assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
908    }
909
910    #[test]
911    fn unknown_chrom_returns_err() {
912        let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
913        let (_tmp, reader) = write_test_genome(contigs);
914
915        let err = reader.fetch_sequence("chrZZ", 0, 4).unwrap_err();
916        assert!(matches!(err, VarEffectError::ChromNotFound { .. }));
917    }
918
919    // -- Fetch operations --------------------------------------------------
920
921    #[test]
922    fn fetch_base_returns_single_byte() {
923        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
924        let (_tmp, reader) = write_test_genome(contigs);
925
926        assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
927        assert_eq!(reader.fetch_base("chr1", 3).unwrap(), b'T');
928    }
929
930    #[test]
931    fn fetch_sequence_returns_correct_range() {
932        let contigs: &[(&str, &[u8])] = &[("chr1", b"ACGTACGTACGTACGT"), ("chrM", b"NNNNACGTACGT")];
933        let (_tmp, reader) = write_test_genome(contigs);
934
935        assert_eq!(reader.fetch_sequence("chr1", 0, 4).unwrap(), b"ACGT");
936        assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"ACGT");
937        assert_eq!(reader.fetch_sequence("chrM", 0, 4).unwrap(), b"NNNN");
938        assert_eq!(reader.fetch_sequence("chrM", 4, 8).unwrap(), b"ACGT");
939    }
940
941    #[test]
942    fn fetch_sequence_uppercases_lowercase_input() {
943        // Builder uppercases — verify lowercase input survives.
944        let contigs: &[(&str, &[u8])] = &[("1", b"acgtacgt")];
945        let (_tmp, reader) = write_test_genome(contigs);
946        let seq = reader.fetch_sequence("chr1", 0, 8).unwrap();
947        assert_eq!(seq, b"ACGTACGT");
948    }
949
950    #[test]
951    fn fetch_sequence_raw_returns_same_as_fetch_sequence() {
952        // With flat binary, raw and uppercased are identical.
953        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGT")];
954        let (_tmp, reader) = write_test_genome(contigs);
955
956        let raw = reader.fetch_sequence_raw("chr1", 0, 8).unwrap();
957        let upper = reader.fetch_sequence("chr1", 0, 8).unwrap();
958        assert_eq!(raw, upper);
959    }
960
961    // -- verify_ref --------------------------------------------------------
962
963    #[test]
964    fn verify_ref_match_and_mismatch() {
965        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
966        let (_tmp, reader) = write_test_genome(contigs);
967
968        assert!(reader.verify_ref("chr1", 0, b"ACGT").unwrap());
969        assert!(!reader.verify_ref("chr1", 0, b"TTTT").unwrap());
970
971        // Case-insensitive: caller passes lowercase, binary is uppercase.
972        assert!(reader.verify_ref("chr1", 0, b"acgt").unwrap());
973
974        // Empty allele trivially verifies.
975        assert!(reader.verify_ref("chr1", 0, b"").unwrap());
976    }
977
978    #[test]
979    fn verify_ref_out_of_bounds_returns_err() {
980        let contigs: &[(&str, &[u8])] = &[("1", b"ACGT")];
981        let (_tmp, reader) = write_test_genome(contigs);
982
983        // pos + ref_allele.len() > length
984        let err = reader.verify_ref("chr1", 2, b"ACGT").unwrap_err();
985        assert!(matches!(err, VarEffectError::CoordinateOutOfRange { .. }));
986    }
987
988    // -- chrom_length ------------------------------------------------------
989
990    #[test]
991    fn chrom_length_reports_correct_values() {
992        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT"), ("MT", b"NNNNACGTACGT")];
993        let (_tmp, reader) = write_test_genome(contigs);
994
995        assert_eq!(reader.chrom_length("chr1"), Some(16));
996        assert_eq!(reader.chrom_length("chrM"), Some(12));
997        assert_eq!(reader.chrom_length("chrZZ"), None);
998    }
999
1000    // -- try_clone ---------------------------------------------------------
1001
1002    #[test]
1003    fn try_clone_produces_independent_reader_sharing_mmap() {
1004        let contigs: &[(&str, &[u8])] = &[("1", b"ACGTACGTACGTACGT")];
1005        let (_tmp, reader) = write_test_genome(contigs);
1006        let cloned = reader.try_clone().expect("try_clone must succeed");
1007
1008        assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
1009        assert_eq!(cloned.fetch_base("chr1", 0).unwrap(), b'A');
1010
1011        // Shared Arc -> same allocation.
1012        assert!(
1013            Arc::ptr_eq(&reader.mmap, &cloned.mmap),
1014            "mmap Arc must be shared across clones"
1015        );
1016        assert!(
1017            Arc::ptr_eq(&reader.contigs, &cloned.contigs),
1018            "contigs Arc must be shared across clones"
1019        );
1020    }
1021
1022    // -- Index validation --------------------------------------------------
1023
1024    #[test]
1025    fn missing_idx_reports_index_not_found() {
1026        let tmp = TempDir::new().unwrap();
1027        let bin_path = tmp.path().join("no_index.bin");
1028        std::fs::write(&bin_path, b"ACGT").unwrap();
1029
1030        let err = FastaReader::open(&bin_path).unwrap_err();
1031        assert!(matches!(err, VarEffectError::IndexNotFound { .. }));
1032    }
1033
1034    #[test]
1035    fn truncated_bin_reports_malformed() {
1036        let tmp = TempDir::new().unwrap();
1037        let bin_path = tmp.path().join("truncated.bin");
1038        let idx_path = tmp.path().join("truncated.bin.idx");
1039
1040        // Write a valid index claiming 100 bytes...
1041        let index = GenomeBinIndex {
1042            version: GENOME_BIN_INDEX_VERSION,
1043            build: "test".into(),
1044            expected_size: 100,
1045            contigs: vec![ContigEntry {
1046                name: "chr1".into(),
1047                offset: 0,
1048                length: 100,
1049            }],
1050        };
1051        let idx_bytes = rmp_serde::to_vec(&index).unwrap();
1052        std::fs::write(&idx_path, &idx_bytes).unwrap();
1053        // ...but only write 10 bytes to the bin file.
1054        std::fs::write(&bin_path, [b'A'; 10]).unwrap();
1055
1056        let err = FastaReader::open(&bin_path).unwrap_err();
1057        assert!(matches!(err, VarEffectError::Malformed(_)));
1058    }
1059
1060    #[test]
1061    fn wrong_index_version_reports_malformed() {
1062        let tmp = TempDir::new().unwrap();
1063        let bin_path = tmp.path().join("badver.bin");
1064        let idx_path = tmp.path().join("badver.bin.idx");
1065
1066        std::fs::write(&bin_path, b"ACGT").unwrap();
1067        let index = GenomeBinIndex {
1068            version: 99,
1069            build: "test".into(),
1070            expected_size: 4,
1071            contigs: vec![ContigEntry {
1072                name: "chr1".into(),
1073                offset: 0,
1074                length: 4,
1075            }],
1076        };
1077        let idx_bytes = rmp_serde::to_vec(&index).unwrap();
1078        std::fs::write(&idx_path, &idx_bytes).unwrap();
1079
1080        let err = FastaReader::open(&bin_path).unwrap_err();
1081        assert!(matches!(err, VarEffectError::Malformed(_)));
1082    }
1083
1084    // -- Multi-contig boundary ---------------------------------------------
1085
1086    #[test]
1087    fn multi_contig_boundary_no_cross_contamination() {
1088        // chr1 ends with TTTT, chr2 starts with AAAA. Fetching the last
1089        // byte of chr1 and the first byte of chr2 must not cross over.
1090        let contigs: &[(&str, &[u8])] = &[("chr1", b"GGGGTTTT"), ("chr2", b"AAAACCCC")];
1091        let (_tmp, reader) = write_test_genome(contigs);
1092
1093        // Last byte of chr1
1094        assert_eq!(reader.fetch_base("chr1", 7).unwrap(), b'T');
1095        // First byte of chr2
1096        assert_eq!(reader.fetch_base("chr2", 0).unwrap(), b'A');
1097
1098        // Range fetch at the boundary
1099        assert_eq!(reader.fetch_sequence("chr1", 4, 8).unwrap(), b"TTTT");
1100        assert_eq!(reader.fetch_sequence("chr2", 0, 4).unwrap(), b"AAAA");
1101    }
1102
1103    // -- Patch aliases -----------------------------------------------------
1104
1105    #[test]
1106    fn ncbi_fasta_rejects_ucsc_patch_without_alias_csv() {
1107        let contigs: &[(&str, &[u8])] = &[
1108            ("NC_000001.11", b"ACGTACGTACGTACGT"),
1109            ("NW_009646194.1", b"GGGGCCCC"),
1110        ];
1111        let (_tmp, reader) = write_test_genome(contigs);
1112
1113        let err = reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap_err();
1114        assert!(
1115            matches!(err, VarEffectError::ChromNotFound { .. }),
1116            "expected ChromNotFound, got {err:?}",
1117        );
1118    }
1119
1120    #[test]
1121    fn ncbi_fasta_resolves_ucsc_patch_via_alias_csv() {
1122        let tmp = TempDir::new().unwrap();
1123        let bin_path = tmp.path().join("ncbi.bin");
1124        let idx_path = tmp.path().join("ncbi.bin.idx");
1125        let contigs: &[(&str, &[u8])] = &[
1126            ("NC_000001.11", b"ACGTACGTACGTACGT"),
1127            ("NW_009646194.1", b"GGGGCCCC"),
1128        ];
1129        write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
1130
1131        let csv_path = write_patch_alias_csv(&tmp);
1132        let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
1133
1134        assert_eq!(reader.naming, ContigNaming::NcbiRefSeq);
1135        assert!(reader.patch_aliases.is_some());
1136
1137        // `chr1_KN196472v1_fix` -> `NW_009646194.1` via alias.
1138        assert_eq!(reader.fetch_base("chr1_KN196472v1_fix", 0).unwrap(), b'G');
1139        assert_eq!(
1140            reader.fetch_sequence("chr1_KN196472v1_fix", 0, 4).unwrap(),
1141            b"GGGG",
1142        );
1143    }
1144
1145    #[test]
1146    fn ncbi_fasta_ignores_alias_csv_for_non_ncbi_binary() {
1147        let tmp = TempDir::new().unwrap();
1148        let bin_path = tmp.path().join("ens.bin");
1149        let idx_path = tmp.path().join("ens.bin.idx");
1150        let contigs: &[(&str, &[u8])] = &[("1", b"ACGT"), ("MT", b"NNNN")];
1151        write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
1152
1153        let csv_path = write_patch_alias_csv(&tmp);
1154        let reader = FastaReader::open_with_patch_aliases(&bin_path, Some(&csv_path)).unwrap();
1155
1156        assert_eq!(reader.naming, ContigNaming::EnsemblBare);
1157        assert!(
1158            reader.patch_aliases.is_none(),
1159            "patch aliases must not load against a non-NCBI binary",
1160        );
1161        assert_eq!(reader.fetch_base("chr1", 0).unwrap(), b'A');
1162    }
1163
1164    // -- Builder validation ------------------------------------------------
1165
1166    #[test]
1167    fn builder_rejects_non_iupac_bytes() {
1168        let tmp = TempDir::new().unwrap();
1169        let bin_path = tmp.path().join("bad.bin");
1170        let idx_path = tmp.path().join("bad.bin.idx");
1171
1172        // 'X' is not a valid IUPAC nucleotide code.
1173        let err =
1174            write_genome_binary(&[("chr1", b"ACGTXN")], "test", &bin_path, &idx_path).unwrap_err();
1175        assert!(matches!(err, VarEffectError::Malformed(_)));
1176    }
1177
1178    #[test]
1179    fn builder_accepts_iupac_ambiguity_codes() {
1180        // The NCBI GRCh38.p14 assembly uses ambiguity codes (M, R, Y, etc.)
1181        // in some patch-scaffold regions. The builder must accept all 15
1182        // standard IUPAC nucleotide codes.
1183        let seq = b"ACGTNRYSWKMBDHV";
1184        let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
1185        let (_tmp, reader) = write_test_genome(contigs);
1186        let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
1187        assert_eq!(fetched, seq.as_slice());
1188    }
1189
1190    #[test]
1191    fn builder_round_trips_all_valid_bases() {
1192        let seq = b"ACGTNNNTTTAAACCCGGG";
1193        let contigs: &[(&str, &[u8])] = &[("chr1", seq.as_slice())];
1194        let (_tmp, reader) = write_test_genome(contigs);
1195        let fetched = reader.fetch_sequence("chr1", 0, seq.len() as u64).unwrap();
1196        assert_eq!(fetched, seq.as_slice());
1197    }
1198
1199    // -- append_idx_extension ----------------------------------------------
1200
1201    #[test]
1202    fn append_idx_extension_preserves_existing_ext() {
1203        let out = append_idx_extension(Path::new("/tmp/GRCh38.bin"));
1204        assert_eq!(out, PathBuf::from("/tmp/GRCh38.bin.idx"));
1205    }
1206}
vareffect/fasta.rs

vareffect/
fasta.rs