ferro-hgvs 0.6.0

HGVS variant normalizer - part of the ferro bioinformatics toolkit
Documentation
//! 3'/5' shuffling algorithm
//!
//! Implementation of the variant shuffling algorithm for normalization.
//!
//! # Coordinate System
//!
//! This module uses **0-based half-open intervals** internally:
//!
//! | Parameter | Basis | Notes |
//! |-----------|-------|-------|
//! | `start` | 0-based | Inclusive start position |
//! | `end` | 0-based | Exclusive end position |
//! | `boundaries.left` | 0-based | Inclusive left limit |
//! | `boundaries.right` | 0-based | Exclusive right limit |
//!
//! Callers must convert from 1-based HGVS positions before calling `shuffle()`.
//! Use `hgvs_pos_to_index()` from [`crate::coords`] for this conversion.

use crate::normalize::boundary::Boundaries;
use crate::normalize::config::ShuffleDirection;

/// Result of a shuffle operation
///
/// All positions are 0-based, matching the input coordinate system.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ShuffleResult {
    /// New start position (0-based, inclusive)
    pub start: u64,
    /// New end position (0-based, exclusive)
    pub end: u64,
    /// Whether the variant was moved
    pub shifted: bool,
}

/// Shuffle a variant towards 3' or 5' end
///
/// This is the core algorithm for HGVS normalization.
/// For insertions and deletions in repetitive regions,
/// the variant position is shifted to the rightmost (3')
/// or leftmost (5') valid position.
///
/// # Arguments
///
/// * `ref_seq` - Reference sequence bytes
/// * `alt_seq` - Alternate sequence bytes (empty for deletions)
/// * `start` - 0-based start position
/// * `end` - 0-based end position (exclusive)
/// * `boundaries` - Shuffling boundaries
/// * `direction` - Direction to shuffle (3' or 5')
///
/// # Returns
///
/// New start and end positions after shuffling
pub fn shuffle(
    ref_seq: &[u8],
    alt_seq: &[u8],
    start: u64,
    end: u64,
    boundaries: &Boundaries,
    direction: ShuffleDirection,
) -> ShuffleResult {
    let mut new_start = start;
    let mut new_end = end;

    match direction {
        ShuffleDirection::ThreePrime => {
            // Shuffle right (3').
            //
            // HGVS rotation invariant for a deletion of length L over the
            // 0-based half-open window [s, s+L): advance to (s+1, s+L+1) iff
            // ref[s] == ref[s+L]. The deleted bases at (s+1, s+L+1) then
            // equal a 1-position rotation of ref[s, s+L), so the haplotype
            // is unchanged. Iterating to the fixed point yields the
            // spec-canonical 3'-most representation.
            while new_end < boundaries.right {
                let ref_idx = new_end as usize;
                if ref_idx >= ref_seq.len() {
                    break;
                }

                // For deletion: check if we can shift right
                if alt_seq.is_empty() {
                    // ref[del_start] == ref[del_end_exclusive] is the
                    // rotation predicate above.
                    let del_start_idx = new_start as usize;
                    if del_start_idx < ref_seq.len() && ref_seq[del_start_idx] == ref_seq[ref_idx] {
                        new_start += 1;
                        new_end += 1;
                    } else {
                        break;
                    }
                } else {
                    // For insertion: check if inserted sequence can shift
                    let alt_idx = ((new_end - start) % alt_seq.len() as u64) as usize;
                    if ref_seq[ref_idx] == alt_seq[alt_idx] {
                        new_start += 1;
                        new_end += 1;
                    } else {
                        break;
                    }
                }
            }
            // Fixed-point invariant: on loop exit while still strictly
            // inside the boundary and the reference, the rotation
            // predicate must be false. A regression that under-shifts
            // by one rotation trips this in debug builds.
            debug_assert!(
                new_end >= boundaries.right
                    || (new_end as usize) >= ref_seq.len()
                    || (alt_seq.is_empty()
                        && ref_seq[new_start as usize] != ref_seq[new_end as usize])
                    || (!alt_seq.is_empty()
                        && ref_seq[new_end as usize]
                            != alt_seq[((new_end - start) % alt_seq.len() as u64) as usize]),
                "3'-shuffle terminated before reaching the rotation fixed point",
            );
        }
        ShuffleDirection::FivePrime => {
            // Shuffle left (5')
            while new_start > boundaries.left {
                let check_idx = new_start - 1;
                let ref_idx = check_idx as usize;
                if ref_idx >= ref_seq.len() {
                    break;
                }

                // For deletion: check if we can shift left
                if alt_seq.is_empty() {
                    let del_end_idx = (new_end - 1) as usize;
                    if del_end_idx < ref_seq.len() && ref_seq[del_end_idx] == ref_seq[ref_idx] {
                        new_start -= 1;
                        new_end -= 1;
                    } else {
                        break;
                    }
                } else {
                    // For insertion: check if inserted sequence can shift
                    let alt_idx =
                        alt_seq.len() - 1 - ((start - new_start) % alt_seq.len() as u64) as usize;
                    if ref_idx < ref_seq.len() && ref_seq[ref_idx] == alt_seq[alt_idx] {
                        new_start -= 1;
                        new_end -= 1;
                    } else {
                        break;
                    }
                }
            }
        }
    }

    ShuffleResult {
        start: new_start,
        end: new_end,
        shifted: new_start != start,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_shuffle_deletion_3prime() {
        // Sequence: ATGGGGGCAT
        // Delete one G at position 3 (0-based): should shift to position 6
        let ref_seq = b"ATGGGGGCAT";
        let boundaries = Boundaries::new(0, 10);

        let result = shuffle(
            ref_seq,
            &[], // deletion
            3,   // start
            4,   // end (exclusive)
            &boundaries,
            ShuffleDirection::ThreePrime,
        );

        assert!(result.shifted);
        assert_eq!(result.start, 6);
        assert_eq!(result.end, 7);
    }

    #[test]
    fn test_shuffle_deletion_5prime() {
        // Sequence: ATGGGGGCAT
        // Delete one G at position 6 (0-based): should shift to position 2
        let ref_seq = b"ATGGGGGCAT";
        let boundaries = Boundaries::new(0, 10);

        let result = shuffle(
            ref_seq,
            &[], // deletion
            6,   // start
            7,   // end
            &boundaries,
            ShuffleDirection::FivePrime,
        );

        assert!(result.shifted);
        assert_eq!(result.start, 2);
        assert_eq!(result.end, 3);
    }

    #[test]
    fn test_no_shuffle_needed() {
        // Sequence: ATGCATGCAT
        // Delete T at position 2: no adjacent T's to shift to
        let ref_seq = b"ATGCATGCAT";
        let boundaries = Boundaries::new(0, 10);

        let result = shuffle(
            ref_seq,
            &[],
            2,
            3,
            &boundaries,
            ShuffleDirection::ThreePrime,
        );

        assert!(!result.shifted);
        assert_eq!(result.start, 2);
        assert_eq!(result.end, 3);
    }

    #[test]
    fn test_shuffle_respects_boundary() {
        // Sequence: ATGGGGGCAT
        // Delete G, but boundary prevents full shift
        let ref_seq = b"ATGGGGGCAT";
        let boundaries = Boundaries::new(0, 5); // Restrict right boundary

        let result = shuffle(
            ref_seq,
            &[],
            3,
            4,
            &boundaries,
            ShuffleDirection::ThreePrime,
        );

        assert!(result.shifted);
        assert_eq!(result.end, 5); // Stopped at boundary
    }
}