// vareffect 0.1.3
//
// Variant consequence prediction and HGVS notation, concordant with Ensembl VEP.
// Documentation
//! Chromosome name conversion.
//!
//! Maps NCBI RefSeq accessions (e.g. `"NC_000006.12"`) to UCSC-style
//! chromosome names (e.g. `"chr6"`) and back again. Patch-sequence
//! accessions (`NW_*`, `NT_*`) are returned unchanged so the transcripts on
//! them remain round-trippable, even though they can't be looked up by
//! standard chromosome name.
//!
//! The mapping is a hardcoded table for the 25 standard GRCh38 chromosomes.
//! Any input that is not in the table — patch accessions included — is
//! returned as-is; the conversion functions perform exact table lookups
//! only and do no pattern-based recognition.

/// GRCh38 `NC_*` accession → UCSC chromosome name.
///
/// Each entry is a `(refseq_accession, ucsc_name)` pair. Both
/// [`refseq_to_ucsc`] and [`ucsc_to_refseq`] scan this slice front to back
/// and return on the first matching entry, so entry order is significant
/// whenever two entries share a key in either direction.
///
/// The table uses the versioned accessions from MANE v1.5 / GRCh38.p14.
/// If a future MANE release bumps an accession version (e.g.
/// `NC_000001.12`), add it here alongside the existing entry rather than
/// replacing it — older cached GFF3 files should continue to resolve
/// correctly. When doing so, keep in mind that [`ucsc_to_refseq`] will
/// return whichever accession for that chromosome appears first.
const NC_TO_UCSC: &[(&str, &str)] = &[
    ("NC_000001.11", "chr1"),
    ("NC_000002.12", "chr2"),
    ("NC_000003.12", "chr3"),
    ("NC_000004.12", "chr4"),
    ("NC_000005.10", "chr5"),
    ("NC_000006.12", "chr6"),
    ("NC_000007.14", "chr7"),
    ("NC_000008.11", "chr8"),
    ("NC_000009.12", "chr9"),
    ("NC_000010.11", "chr10"),
    ("NC_000011.10", "chr11"),
    ("NC_000012.12", "chr12"),
    ("NC_000013.11", "chr13"),
    ("NC_000014.9", "chr14"),
    ("NC_000015.10", "chr15"),
    ("NC_000016.10", "chr16"),
    ("NC_000017.11", "chr17"),
    ("NC_000018.10", "chr18"),
    ("NC_000019.10", "chr19"),
    ("NC_000020.11", "chr20"),
    ("NC_000021.9", "chr21"),
    ("NC_000022.11", "chr22"),
    // Sex chromosomes and the mitochondrial sequence use non-numeric names.
    ("NC_000023.11", "chrX"),
    ("NC_000024.10", "chrY"),
    ("NC_012920.1", "chrM"),
];

/// Translate a RefSeq chromosome accession into its UCSC-style name.
///
/// Only accessions listed in the hardcoded standard-chromosome table are
/// translated. Every other input — patch sequences with the `NW_*` or
/// `NT_*` prefixes included — comes back exactly as it was passed in.
///
/// # Examples
///
/// ```
/// use vareffect::chrom::refseq_to_ucsc;
/// assert_eq!(refseq_to_ucsc("NC_000006.12"), "chr6");
/// assert_eq!(refseq_to_ucsc("NW_025791820.1"), "NW_025791820.1");
/// assert_eq!(refseq_to_ucsc("gibberish"), "gibberish");
/// ```
pub fn refseq_to_ucsc(acc: &str) -> &str {
    // A first-match scan is deliberate: with only 25 static rows a HashMap
    // would buy nothing. This lookup runs once per mRNA/gene row during
    // GFF3 ingest, where the total cost of scanning is negligible.
    NC_TO_UCSC
        .iter()
        .find(|&&(refseq, _)| refseq == acc)
        // No table hit: hand the caller's input straight back.
        .map_or(acc, |&(_, ucsc)| ucsc)
}

/// Translate a UCSC-style chromosome name into its RefSeq accession.
///
/// This is the reverse of [`refseq_to_ucsc`] and reads the same 25-entry
/// const table. Anything the table does not list — patch sequences,
/// UCSC-style patch names such as `chr9_KN196479v1_fix`, arbitrary
/// strings — is handed back untouched. The
/// [`FastaReader`](crate::FastaReader) consults a runtime alias table for
/// patch-contig translation and depends on this pass-through for callers
/// that already hold a RefSeq accession.
///
/// # Examples
///
/// ```
/// use vareffect::chrom::ucsc_to_refseq;
/// assert_eq!(ucsc_to_refseq("chr6"), "NC_000006.12");
/// assert_eq!(ucsc_to_refseq("chrM"), "NC_012920.1");
/// assert_eq!(ucsc_to_refseq("chr9_KN196479v1_fix"), "chr9_KN196479v1_fix");
/// assert_eq!(ucsc_to_refseq("NC_000001.11"), "NC_000001.11");
/// ```
pub fn ucsc_to_refseq(ucsc: &str) -> &str {
    // Same table, scanned on its second column. Building an inverted
    // HashMap would only duplicate the table at startup; the reverse
    // lookup fires once per `FastaReader` query and a 25-row scan is cheap
    // at that size.
    match NC_TO_UCSC.iter().find(|entry| entry.1 == ucsc) {
        Some(&(refseq, _)) => refseq,
        // Not a primary-chromosome name: pass the input through.
        None => ucsc,
    }
}

/// Report whether `chrom` names a non-primary GRCh38 contig.
///
/// Two spellings are accepted: the RefSeq accession form (`NW_*`, `NT_*` —
/// used by NCBI MANE summary TSVs) and the UCSC alt/fix/random/unlocalized
/// form (used by NCBI MANE GFF3 column 1). The check looks only at the
/// prefix and suffix of the string.
///
/// Such contigs are parsed and retained in the store, but transcripts on
/// them cannot be looked up by a standard chromosome name like `chr6`.
///
/// Recognized UCSC patterns:
/// - `chr*_*_fix` — fix patches (e.g. `chr9_KN196479v1_fix`)
/// - `chr*_*_alt` — alternate haplotypes (e.g. `chr22_KI270879v1_alt`)
/// - `chr*_*_random` — unlocalized contigs (e.g. `chr1_KI270706v1_random`)
/// - `chrUn_*` — unplaced scaffolds (e.g. `chrUn_KI270302v1`)
pub fn is_patch_sequence(chrom: &str) -> bool {
    // Prefixes: the two RefSeq forms (MANE summary TSV and other NCBI
    // products) plus the UCSC unplaced-scaffold form.
    const PATCH_PREFIXES: [&str; 3] = ["NW_", "NT_", "chrUn_"];
    // Suffixes: UCSC forms used by the MANE GFF3 column 1 for alt/fix
    // contigs, plus the `_random` unlocalized contigs.
    const PATCH_SUFFIXES: [&str; 3] = ["_alt", "_fix", "_random"];

    let has_patch_prefix = PATCH_PREFIXES
        .iter()
        .any(|&prefix| chrom.starts_with(prefix));
    let has_patch_suffix = PATCH_SUFFIXES
        .iter()
        .any(|&suffix| chrom.ends_with(suffix));

    has_patch_prefix || has_patch_suffix
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Expected `(RefSeq accession, UCSC name)` pairs for the 25 standard
    /// GRCh38 chromosomes. Written out here rather than borrowed from
    /// `NC_TO_UCSC` so the expectations do not simply mirror the table
    /// under test.
    const STANDARD_PAIRS: [(&str, &str); 25] = [
        ("NC_000001.11", "chr1"),
        ("NC_000002.12", "chr2"),
        ("NC_000003.12", "chr3"),
        ("NC_000004.12", "chr4"),
        ("NC_000005.10", "chr5"),
        ("NC_000006.12", "chr6"),
        ("NC_000007.14", "chr7"),
        ("NC_000008.11", "chr8"),
        ("NC_000009.12", "chr9"),
        ("NC_000010.11", "chr10"),
        ("NC_000011.10", "chr11"),
        ("NC_000012.12", "chr12"),
        ("NC_000013.11", "chr13"),
        ("NC_000014.9", "chr14"),
        ("NC_000015.10", "chr15"),
        ("NC_000016.10", "chr16"),
        ("NC_000017.11", "chr17"),
        ("NC_000018.10", "chr18"),
        ("NC_000019.10", "chr19"),
        ("NC_000020.11", "chr20"),
        ("NC_000021.9", "chr21"),
        ("NC_000022.11", "chr22"),
        ("NC_000023.11", "chrX"),
        ("NC_000024.10", "chrY"),
        ("NC_012920.1", "chrM"),
    ];

    #[test]
    fn maps_all_standard_chromosomes() {
        // Forward direction, exhaustively: every standard accession must
        // resolve to its UCSC name.
        for &(acc, expected_ucsc) in STANDARD_PAIRS.iter() {
            assert_eq!(
                refseq_to_ucsc(acc),
                expected_ucsc,
                "mapping failed for {acc}"
            );
        }
    }

    #[test]
    fn patch_sequences_round_trip_unchanged() {
        for patch in ["NW_025791820.1", "NT_187633.1"] {
            assert_eq!(refseq_to_ucsc(patch), patch);
        }
    }

    #[test]
    fn unknown_accession_returned_as_is() {
        for unknown in ["gibberish", ""] {
            assert_eq!(refseq_to_ucsc(unknown), unknown);
        }
    }

    #[test]
    fn ucsc_to_refseq_maps_all_standard_chromosomes() {
        // Inverse counterpart of `maps_all_standard_chromosomes`. Kept
        // exhaustive so drift between the two mapping functions shows up
        // in CI instead of as a silent FASTA lookup miss.
        for &(expected_refseq, ucsc) in STANDARD_PAIRS.iter() {
            assert_eq!(
                ucsc_to_refseq(ucsc),
                expected_refseq,
                "inverse mapping failed for {ucsc}"
            );
        }
    }

    #[test]
    fn ucsc_to_refseq_passes_through_patches_and_unknowns() {
        let pass_through = [
            // UCSC-style patch names must come back unchanged — the
            // reverse FastaReader alias table handles them at runtime,
            // not this const table.
            "chr9_KN196479v1_fix",
            "chr22_KI270879v1_alt",
            "chrUn_KI270302v1",
            // A RefSeq accession maps to itself (it is not re-translated).
            "NC_000001.11",
            "NW_025791820.1",
            // Arbitrary strings pass through, as in `refseq_to_ucsc`.
            "gibberish",
            "",
        ];
        for name in pass_through {
            assert_eq!(ucsc_to_refseq(name), name);
        }
    }

    #[test]
    fn ucsc_refseq_round_trip_identity_on_primary_chromosomes() {
        // `refseq_to_ucsc(ucsc_to_refseq(x)) == x` must hold for every
        // UCSC primary chromosome listed in NC_TO_UCSC, guarding against
        // either function silently losing an entry.
        for ucsc in NC_TO_UCSC.iter().map(|&(_, name)| name) {
            let refseq = ucsc_to_refseq(ucsc);
            assert_eq!(
                refseq_to_ucsc(refseq),
                ucsc,
                "round-trip failed for {ucsc}"
            );
        }
    }

    #[test]
    fn is_patch_sequence_detects_prefixes() {
        for patch in ["NW_025791820.1", "NT_187633.1"] {
            assert!(
                is_patch_sequence(patch),
                "{patch:?} should be flagged as a patch sequence"
            );
        }
        for primary in ["NC_000001.11", "chr1", ""] {
            assert!(
                !is_patch_sequence(primary),
                "{primary:?} should not be flagged as a patch sequence"
            );
        }
    }

    #[test]
    fn is_patch_sequence_detects_ucsc_alt_fix_contigs() {
        // UCSC-style contig names as they appear in column 1 of the real
        // MANE v1.5 GFF3 file.
        let non_primary = [
            "chr9_KN196479v1_fix",
            "chr22_KI270879v1_alt",
            "chr1_KI270706v1_random",
            "chrUn_KI270302v1",
        ];
        for contig in non_primary {
            assert!(
                is_patch_sequence(contig),
                "{contig:?} should be flagged as a patch sequence"
            );
        }
        // Primary chromosomes must NOT be flagged.
        for primary in ["chr1", "chrX", "chrY", "chrM"] {
            assert!(
                !is_patch_sequence(primary),
                "{primary:?} should not be flagged as a patch sequence"
            );
        }
    }
}