// onelib 0.2.0
//
// Rust implementation of the ONEcode file format.
// Documentation
//! LTF variable-length integer encoding.
//!
//! This is ONEcode's compact integer encoding, inspired by (but not
//! compatible with) the ITF8/LTF8 encoding used in CRAM/htslib.
//!
//! # Encoding scheme
//!
//! | Byte width | Value range        | Prefix byte           |
//! |------------|--------------------|-----------------------|
//! | 1          | 0–63               | `01xxxxxx`            |
//! | 1          | −64–−1             | `11xxxxxx`            |
//! | 2          | 0–8 191            | `001xxxxx` + 1 byte   |
//! | 3–9        | positive, wider    | `0000_0ddd` + d bytes |
//! | 3–9        | negative, wider    | `1000_0ddd` + d bytes |
//!
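//! Payloads of the 3–9 byte forms are stored in **little-endian** order. The
//! `ddd` bits in the prefix encode `d − 1`, where `d` is the number of payload
//! bytes (2–8). The 2-byte form is laid out differently: the prefix byte
//! carries the high 5 bits of the value and the following byte the low 8 bits.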
//!
//! Note: the 2-byte negative case is not used — negative values that don't
//! fit in 1 byte go directly to 3+ bytes. This matches the C reference.

use std::io::{self, Read, Write};

/// Serialise `value` as an LTF integer at the start of `buf`.
///
/// Returns how many bytes of `buf` were filled: 1 for values in `-64..=63`,
/// 2 for values in `64..=8191`, and otherwise one prefix byte followed by the
/// 2–8 low-order little-endian bytes of `value` (3–9 bytes in total).
///
/// # Panics
///
/// Debug builds assert that `buf` holds at least 9 bytes. In every build the
/// function panics if `buf` is too short for the bytes actually written.
pub fn encode(value: i64, buf: &mut [u8]) -> usize {
    debug_assert!(buf.len() >= 9);

    let raw = value.to_le_bytes();
    match value {
        // One byte: `01xxxxxx` for 0..=63, `11xxxxxx` for -64..=-1. For the
        // negative half the low byte already has bit 6 set, so the OR is a
        // no-op there and both halves share one expression.
        -64..=63 => {
            buf[0] = raw[0] | 0x40;
            1
        }
        // Two bytes, non-negative only: `001` + high 5 bits, then low 8 bits.
        64..=0x1fff => {
            buf[0] = raw[1] | 0x20;
            buf[1] = raw[0];
            2
        }
        // Everything else: sign in bit 7 of the prefix, (payload length - 1)
        // in its low three bits, then the payload in little-endian order.
        _ => {
            let (sign_bit, payload_len) = if value < 0 {
                (0x80u8, data_bytes_negative(value))
            } else {
                (0x00u8, data_bytes_positive(value))
            };
            buf[0] = sign_bit | (payload_len - 1) as u8;
            buf[1..1 + payload_len].copy_from_slice(&raw[..payload_len]);
            1 + payload_len
        }
    }
}

/// Decode one LTF integer from the start of `buf`, returning
/// `(value, bytes_consumed)`.
///
/// # Panics
///
/// Panics if
/// - `buf` is empty or shorter than the encoding announced by its first byte,
/// - the first byte is a multi-byte prefix whose low three bits are zero, or
/// - the first byte has the form `101xxxxx` (the unused 2-byte negative
///   prefix, which `encode` never emits).
pub fn decode(buf: &[u8]) -> (i64, usize) {
    let prefix = buf[0];
    // The ranges below partition all 256 byte values by their top three bits,
    // so the match is exhaustive without a catch-all arm.
    match prefix {
        // 01xxxxxx — one byte, non-negative: the low six bits are the value.
        0x40..=0x7f => (i64::from(prefix & 0x3f), 1),
        // 11xxxxxx — one byte, negative: the byte read as a signed 8-bit
        // integer is the value.
        0xc0..=0xff => (i64::from(prefix as i8), 1),
        // 001xxxxx — two bytes, non-negative: high five bits in the prefix,
        // low eight bits in the next byte.
        0x20..=0x3f => ((i64::from(prefix & 0x1f) << 8) | i64::from(buf[1]), 2),
        // 000xxxxx — multi-byte, non-negative.
        0x00..=0x1f => {
            let d = usize::from(prefix & 0x07);
            assert!(d >= 1, "LTF decode: invalid byte count 0 in positive multi-byte");
            // `d` is (payload bytes - 1): d + 1 payload bytes plus the prefix.
            (read_le_masked(&buf[1..], d), d + 2)
        }
        // 100xxxxx — multi-byte, negative.
        0x80..=0x9f => {
            let d = usize::from(prefix & 0x07);
            assert!(d >= 1, "LTF decode: invalid byte count 0 in negative multi-byte");
            (read_le_sign_extended(&buf[1..], d), d + 2)
        }
        // 101xxxxx — would be the 2-byte negative form, which is never
        // written. Malformed input can still contain it, so this arm is
        // reachable and reports the offending byte.
        0xa0..=0xbf => panic!("LTF decode: unexpected prefix {prefix:#04x}"),
    }
}

/// Encode `value` as an LTF integer and send the bytes to `writer`.
///
/// Returns the number of bytes handed to `writer`.
///
/// # Errors
///
/// Propagates any error returned by `writer.write_all`.
pub fn write(value: i64, writer: &mut impl Write) -> io::Result<usize> {
    // Nine bytes is the size `encode` expects: one prefix plus up to eight
    // payload bytes.
    let mut scratch = [0u8; 9];
    let len = encode(value, &mut scratch);
    writer.write_all(&scratch[..len]).map(|()| len)
}

/// Read one LTF-encoded integer from `reader`.
///
/// Consumes the prefix byte and, for the longer forms, the payload bytes that
/// the prefix announces.
///
/// # Errors
///
/// Propagates any error from `reader.read_exact`, including
/// [`io::ErrorKind::UnexpectedEof`] when the stream ends inside an encoding.
///
/// Returns [`io::ErrorKind::InvalidData`] when the prefix byte is not a valid
/// LTF prefix: either the `101xxxxx` form (the unused 2-byte negative
/// encoding) or a multi-byte prefix whose low three bits are zero. These are
/// the same inputs `decode` rejects. Only the prefix byte has been consumed
/// when this error is returned.
pub fn read(reader: &mut impl Read) -> io::Result<i64> {
    let mut byte = [0u8; 1];
    reader.read_exact(&mut byte)?;
    let prefix = byte[0];

    // The ranges partition all 256 byte values by their top three bits.
    match prefix {
        // 01xxxxxx — one byte, non-negative.
        0x40..=0x7f => Ok(i64::from(prefix & 0x3f)),
        // 11xxxxxx — one byte, negative: the byte as a signed 8-bit integer.
        0xc0..=0xff => Ok(i64::from(prefix as i8)),
        // 001xxxxx — two bytes, non-negative: high five bits, then low eight.
        0x20..=0x3f => {
            reader.read_exact(&mut byte)?;
            Ok((i64::from(prefix & 0x1f) << 8) | i64::from(byte[0]))
        }
        // 101xxxxx — never produced by the encoder; treating it as a two-byte
        // positive would silently yield a wrong value.
        0xa0..=0xbf => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("LTF read: unexpected prefix {prefix:#04x}"),
        )),
        // 000xxxxx / 100xxxxx — multi-byte; bit 7 is the sign and the low
        // three bits hold (payload bytes - 1).
        0x00..=0x1f | 0x80..=0x9f => {
            let extra = usize::from(prefix & 0x07);
            if extra == 0 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("LTF read: invalid byte count 0 in prefix {prefix:#04x}"),
                ));
            }
            // Pre-fill with the sign so the bytes above the payload extend it
            // once the little-endian payload overwrites the low end.
            let mut payload = if prefix & 0x80 != 0 {
                [0xffu8; 8]
            } else {
                [0u8; 8]
            };
            reader.read_exact(&mut payload[..=extra])?;
            Ok(i64::from_le_bytes(payload))
        }
    }
}

// --- helpers ---

/// Payload length, in bytes, for a non-negative value that is too large for
/// the 1- and 2-byte forms.
///
/// Returns the smallest count in `2..=8` whose bytes hold every set bit of
/// `val`; the result is never below 2.
fn data_bytes_positive(val: i64) -> usize {
    debug_assert!(val >= 0);
    // `val >> 8n` is zero exactly when no bit at or above position 8n is set,
    // i.e. when `val` fits in `n` bytes.
    (2..8).find(|&n| val >> (8 * n) == 0).unwrap_or(8)
}

/// Payload length, in bytes, for a negative value that is too small for the
/// 1-byte form.
///
/// Returns the smallest count in `2..=8` such that every bit of `val` above
/// those bytes is 1; the result is never below 2.
fn data_bytes_negative(val: i64) -> usize {
    debug_assert!(val < 0);
    // Work on the complement: its set bits are exactly the zero bits of
    // `val`, so it fits in `n` bytes when everything above them in `val` is 1.
    let inverted = !val;
    (2..8).find(|&n| inverted >> (8 * n) == 0).unwrap_or(8)
}

/// Assemble the first `d + 1` bytes of `buf` (little-endian) into an `i64`,
/// leaving all higher bytes zero.
///
/// `d` is the value stored in the prefix's low three bits, which is one less
/// than the payload length. Panics if `d + 1` exceeds 8 or `buf.len()`.
fn read_le_masked(buf: &[u8], d: usize) -> i64 {
    let width = d + 1;
    let mut widened = [0u8; 8];
    // Only the low `width` bytes are overwritten; the rest stay zero.
    let (low, _) = widened.split_at_mut(width);
    low.copy_from_slice(&buf[..width]);
    i64::from_le_bytes(widened)
}

/// Assemble the first `d + 1` bytes of `buf` (little-endian) into an `i64`,
/// setting every higher byte to `0xff`.
///
/// `d` is one less than the payload length. Panics if `d + 1` exceeds 8 or
/// `buf.len()`.
fn read_le_sign_extended(buf: &[u8], d: usize) -> i64 {
    let width = d + 1;
    // All-ones start value: the bytes above the payload keep the sign.
    let mut widened = [0xffu8; 8];
    let (low, _) = widened.split_at_mut(width);
    low.copy_from_slice(&buf[..width]);
    i64::from_le_bytes(widened)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every value in `0..=63` takes one byte and survives a round trip.
    #[test]
    fn single_byte_positive() {
        let mut scratch = [0u8; 9];
        (0i64..=63).for_each(|v| {
            let n = encode(v, &mut scratch);
            assert_eq!(n, 1, "value {v} should encode in 1 byte");
            let (back, used) = decode(&scratch);
            assert_eq!(used, 1);
            assert_eq!(back, v, "round-trip failed for {v}");
        });
    }

    /// Every value in `-64..=-1` takes one byte and survives a round trip.
    #[test]
    fn single_byte_negative() {
        let mut scratch = [0u8; 9];
        (-64i64..=-1).for_each(|v| {
            let n = encode(v, &mut scratch);
            assert_eq!(n, 1, "value {v} should encode in 1 byte");
            let (back, used) = decode(&scratch);
            assert_eq!(used, 1);
            assert_eq!(back, v, "round-trip failed for {v}");
        });
    }

    /// Samples from `64..=8191` take two bytes and survive a round trip.
    #[test]
    fn two_byte_positive() {
        let samples: [i64; 5] = [64, 100, 255, 1000, 8191];
        let mut scratch = [0u8; 9];
        for v in samples {
            let n = encode(v, &mut scratch);
            assert_eq!(n, 2, "value {v} should encode in 2 bytes");
            let (back, used) = decode(&scratch);
            assert_eq!(used, 2);
            assert_eq!(back, v, "round-trip failed for {v}");
        }
    }

    /// Values on either side of each width boundary encode to the expected
    /// length and decode back unchanged.
    #[test]
    fn boundary_values() {
        // (value, expected encoded length)
        let cases: [(i64, usize); 12] = [
            (0, 1),
            (63, 1),
            (64, 2),
            (-1, 1),
            (-64, 1),
            (-65, 3), // skips 2-byte negative
            (8191, 2),
            (8192, 3),
            (0xffff, 3),
            (0x1_0000, 4),
            (i64::MAX, 9),
            (i64::MIN, 9),
        ];
        let mut scratch = [0u8; 9];
        for (val, expected_bytes) in cases {
            let n = encode(val, &mut scratch);
            assert_eq!(n, expected_bytes, "value {val} should encode in {expected_bytes} bytes, got {n}");
            let (back, used) = decode(&scratch);
            assert_eq!(used, expected_bytes);
            assert_eq!(back, val, "round-trip failed for {val}");
        }
    }

    /// A sequence written with `write` reads back in order with `read`.
    #[test]
    fn read_write_round_trip() {
        let values: [i64; 19] = [
            0, 1, -1, 63, 64, -64, -65, 8191, 8192, -8193,
            0xffff, 0x1_0000, -0x1_0000, i64::MAX, i64::MIN,
            42, -42, 1000000, -1000000,
        ];

        let mut stream: Vec<u8> = Vec::new();
        values.iter().for_each(|&v| {
            write(v, &mut stream).unwrap();
        });

        let mut source = std::io::Cursor::new(&stream);
        for &expected in &values {
            let got = read(&mut source).unwrap();
            assert_eq!(got, expected, "stream round-trip failed for {expected}");
        }
    }
}