use std::path::Path;
use tracing::instrument;
#[non_exhaustive]
#[derive(Debug, thiserror::Error)]
pub enum GziError {
#[error("I/O error reading GZI index {path}")]
Read { path: std::path::PathBuf, source: std::io::Error },
#[error("GZI index {path} is too short ({actual} bytes, need at least 8 for the entry count)")]
TooShort { path: std::path::PathBuf, actual: usize },
#[error("GZI index {path}: entry count {count} causes size overflow")]
SizeOverflow { path: std::path::PathBuf, count: u64 },
#[error("GZI index {path}: expected {expected} bytes for {count} entries, got {actual}")]
WrongSize { path: std::path::PathBuf, count: u64, expected: u64, actual: usize },
#[error("GZI index {path}: entry {index} is truncated")]
TruncatedEntry { path: std::path::PathBuf, index: usize },
#[error(
"GZI index {path}: entries are not sorted at entry {index} (uncompressed offset {offset} <= previous {prev_offset})"
)]
UnsortedEntries { path: std::path::PathBuf, index: usize, offset: u64, prev_offset: u64 },
#[error(
"GZI within-block offset {within_block} exceeds BGZF max block size (65535) — GZI index may be corrupt or missing entries"
)]
WithinBlockOverflow { within_block: u64 },
#[error("internal GZI error: binary search returned out-of-bounds index {index}")]
InternalBoundsError { index: usize },
}
#[derive(Debug, Clone, Copy)]
struct GziEntry {
compressed_offset: u64,
uncompressed_offset: u64,
}
#[derive(Debug, Clone)]
pub struct GziIndex {
entries: Vec<GziEntry>,
}
impl GziIndex {
#[instrument(level = "debug")]
pub fn from_file(path: &Path) -> Result<Self, GziError> {
let data = std::fs::read(path)
.map_err(|source| GziError::Read { path: path.to_path_buf(), source })?;
Self::parse(&data, path)
}
fn parse(data: &[u8], path: &Path) -> Result<Self, GziError> {
if data.len() < 8 {
return Err(GziError::TooShort { path: path.to_path_buf(), actual: data.len() });
}
let count =
u64::from_le_bytes(data.get(..8).and_then(|s| s.try_into().ok()).ok_or_else(|| {
GziError::TooShort { path: path.to_path_buf(), actual: data.len() }
})?);
let expected_size = count
.checked_mul(16)
.and_then(|n| n.checked_add(8))
.ok_or_else(|| GziError::SizeOverflow { path: path.to_path_buf(), count })?;
if data.len() as u64 != expected_size {
return Err(GziError::WrongSize {
path: path.to_path_buf(),
count,
expected: expected_size,
actual: data.len(),
});
}
#[expect(
clippy::cast_possible_truncation,
reason = "count * 16 + 8 == data.len() which fits in usize, so count fits in usize"
)]
let count_usize = count as usize;
let mut entries: Vec<GziEntry> = Vec::with_capacity(count_usize);
for i in 0..count_usize {
let base = i
.checked_mul(16)
.and_then(|n| n.checked_add(8))
.ok_or_else(|| GziError::TruncatedEntry { path: path.to_path_buf(), index: i })?;
let compressed_offset = u64::from_le_bytes(
data.get(
base..base.checked_add(8).ok_or_else(|| GziError::TruncatedEntry {
path: path.to_path_buf(),
index: i,
})?,
)
.and_then(|s| s.try_into().ok())
.ok_or_else(|| GziError::TruncatedEntry { path: path.to_path_buf(), index: i })?,
);
let base8 = base
.checked_add(8)
.ok_or_else(|| GziError::TruncatedEntry { path: path.to_path_buf(), index: i })?;
let uncompressed_offset = u64::from_le_bytes(
data.get(
base8..base8.checked_add(8).ok_or_else(|| GziError::TruncatedEntry {
path: path.to_path_buf(),
index: i,
})?,
)
.and_then(|s| s.try_into().ok())
.ok_or_else(|| GziError::TruncatedEntry { path: path.to_path_buf(), index: i })?,
);
if let Some(prev) = entries.last()
&& uncompressed_offset <= prev.uncompressed_offset
{
return Err(GziError::UnsortedEntries {
path: path.to_path_buf(),
index: i,
offset: uncompressed_offset,
prev_offset: prev.uncompressed_offset,
});
}
entries.push(GziEntry { compressed_offset, uncompressed_offset });
}
Ok(GziIndex { entries })
}
#[doc(hidden)]
pub fn parse_test(data: &[u8]) -> Self {
Self::parse(data, Path::new("<test>")).expect("test GZI data should be valid")
}
#[cfg(feature = "fuzz")]
pub fn from_bytes(data: &[u8]) -> Result<Self, GziError> {
Self::parse(data, Path::new("<fuzz>"))
}
pub fn translate(&self, uncompressed_offset: u64) -> Result<BlockLocation, GziError> {
let (compressed_offset, within_block) = match self
.entries
.binary_search_by_key(&uncompressed_offset, |e| e.uncompressed_offset)
{
Ok(i) => {
let e = self.entries.get(i).ok_or(GziError::InternalBoundsError { index: i })?;
(e.compressed_offset, 0u64)
}
Err(0) => (0, uncompressed_offset),
Err(i) => {
let prev = i.checked_sub(1).ok_or(GziError::InternalBoundsError { index: i })?;
let e =
self.entries.get(prev).ok_or(GziError::InternalBoundsError { index: prev })?;
let within = uncompressed_offset
.checked_sub(e.uncompressed_offset)
.ok_or(GziError::InternalBoundsError { index: prev })?;
(e.compressed_offset, within)
}
};
let within_block_offset = u16::try_from(within_block)
.map_err(|_| GziError::WithinBlockOverflow { within_block })?;
Ok(BlockLocation { compressed_offset, within_block_offset })
}
}
#[derive(Debug, Clone, Copy)]
pub struct BlockLocation {
pub compressed_offset: u64,
pub within_block_offset: u16,
}
#[cfg(test)]
#[allow(clippy::arithmetic_side_effects, reason = "test arithmetic is not safety-critical")]
mod tests {
use super::*;
use std::path::PathBuf;
fn test_path() -> PathBuf {
PathBuf::from("test.gzi")
}
fn make_gzi_data(entries: &[(u64, u64)]) -> Vec<u8> {
let mut data = Vec::new();
data.extend_from_slice(&(entries.len() as u64).to_le_bytes());
for &(compressed, uncompressed) in entries {
data.extend_from_slice(&compressed.to_le_bytes());
data.extend_from_slice(&uncompressed.to_le_bytes());
}
data
}
#[test]
fn parse_empty_gzi() {
let data = make_gzi_data(&[]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
assert!(gzi.entries.is_empty());
}
#[test]
fn parse_gzi_with_entries() {
let data = make_gzi_data(&[(65536, 65280), (131072, 130560)]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
assert_eq!(gzi.entries.len(), 2);
assert_eq!(gzi.entries[0].compressed_offset, 65536);
assert_eq!(gzi.entries[0].uncompressed_offset, 65280);
}
#[test]
fn parse_truncated_gzi() {
let err = GziIndex::parse(&[0; 4], &test_path()).unwrap_err();
assert!(matches!(err, GziError::TooShort { .. }));
}
#[test]
fn parse_wrong_size_gzi() {
let mut data = make_gzi_data(&[(100, 200)]);
data[..8].copy_from_slice(&2u64.to_le_bytes());
let err = GziIndex::parse(&data, &test_path()).unwrap_err();
assert!(matches!(err, GziError::WrongSize { .. }));
}
#[test]
fn translate_before_first_entry() {
let data = make_gzi_data(&[(65536, 65280)]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(100).unwrap();
assert_eq!(loc.compressed_offset, 0);
assert_eq!(loc.within_block_offset, 100);
}
#[test]
fn translate_exact_match() {
let data = make_gzi_data(&[(65536, 65280), (131072, 130560)]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(65280).unwrap();
assert_eq!(loc.compressed_offset, 65536);
assert_eq!(loc.within_block_offset, 0);
}
#[test]
fn translate_between_entries() {
let data = make_gzi_data(&[(65536, 65280), (131072, 130560)]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(70000).unwrap();
assert_eq!(loc.compressed_offset, 65536);
assert_eq!(loc.within_block_offset, 4720u16);
}
#[test]
fn translate_after_last_entry() {
let data = make_gzi_data(&[(65536, 65280)]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(100000).unwrap();
assert_eq!(loc.compressed_offset, 65536);
assert_eq!(loc.within_block_offset, 34720u16);
}
#[test]
fn translate_empty_gzi() {
let data = make_gzi_data(&[]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(12345).unwrap();
assert_eq!(loc.compressed_offset, 0);
assert_eq!(loc.within_block_offset, 12345);
}
#[test]
fn translate_rejects_within_block_exceeding_u16() {
let data = make_gzi_data(&[]);
let gzi = GziIndex::parse(&data, &test_path()).unwrap();
let loc = gzi.translate(65535).unwrap();
assert_eq!(loc.within_block_offset, 65535);
let err = gzi.translate(65536).unwrap_err();
assert!(matches!(err, GziError::WithinBlockOverflow { .. }));
}
#[test]
fn reject_unsorted_entries() {
let data = make_gzi_data(&[(100, 200), (300, 100)]);
let err = GziIndex::parse(&data, &test_path()).unwrap_err();
assert!(
matches!(err, GziError::UnsortedEntries { .. }),
"expected UnsortedEntries, got {err:?}"
);
}
#[test]
fn reject_duplicate_uncompressed_offsets() {
let data = make_gzi_data(&[(100, 200), (300, 200)]);
let err = GziIndex::parse(&data, &test_path()).unwrap_err();
assert!(matches!(err, GziError::UnsortedEntries { .. }));
}
#[test]
fn reject_overflow_count() {
let mut data = vec![0u8; 8];
data[..8].copy_from_slice(&u64::MAX.to_le_bytes());
let err = GziIndex::parse(&data, &test_path()).unwrap_err();
assert!(matches!(err, GziError::SizeOverflow { .. }), "expected SizeOverflow, got {err:?}");
}
}