use std::io::Write;
use crate::error::{GzippyError, GzippyResult};
/// Byte signature written at the start of a serialized index by
/// `serialize_index` and verified by `load_index`.
const MAGIC: &[u8] = b"GZIDX\x01";
/// Number of window bytes stored with every index point (32768 bytes).
/// NOTE(review): presumably chosen to match DEFLATE's 32 KiB history
/// window — confirm against the decompressor's expectations.
const WINDOW_SIZE: usize = 32768;
/// A single checkpoint from which decompression can be resumed.
#[derive(Clone)]
pub struct IndexPoint {
/// Bit position of the checkpoint. `seek_decompress` passes it to the
/// decompressor together with the slice that starts at
/// `SeekIndex::deflate_offset`, so it is measured from the start of the
/// deflate data rather than from the start of the gzip file.
pub compressed_bit_offset: u64,
/// Offset in the uncompressed output that corresponds to this checkpoint.
pub uncompressed_offset: u64,
/// Window bytes handed to the decompressor when resuming at this point.
/// `build_index` copies at most `WINDOW_SIZE` bytes from the scanner's
/// window into the front of this array and leaves the rest zeroed.
pub window: [u8; WINDOW_SIZE],
}
/// Random-access index over the deflate stream of a gzip file.
pub struct SeekIndex {
/// Checkpoints; `load_index` requires their `uncompressed_offset` values to
/// be strictly increasing, and `seek_decompress` binary-searches on them.
pub points: Vec<IndexPoint>,
/// Total uncompressed size as reported by the scan in `build_index`.
pub total_uncompressed_size: u64,
/// Byte offset inside the gzip data at which the deflate data begins
/// (the size of the gzip header as parsed in `build_index`).
pub deflate_offset: usize,
}
/// Scans a gzip file and builds a [`SeekIndex`] of resumable checkpoints.
///
/// `interval_bytes` is forwarded unchanged to the deflate scanner.
///
/// # Errors
///
/// Returns an invalid-argument error when the input does not start with the
/// gzip magic bytes, looks like a multi-member file, carries BGZF markers,
/// has an unparsable header, or has a header that reaches the end of the
/// data. Returns a decompression error when the scan itself fails.
pub fn build_index(gzip_data: &[u8], interval_bytes: usize) -> GzippyResult<SeekIndex> {
    let has_gzip_magic = gzip_data.len() >= 2 && gzip_data[0] == 0x1f && gzip_data[1] == 0x8b;
    if !has_gzip_magic {
        return Err(GzippyError::invalid_argument("Not a gzip file"));
    }

    if crate::decompress::format::is_likely_multi_member(gzip_data) {
        return Err(GzippyError::invalid_argument(
            "Multi-member gzip files are not yet supported for indexing",
        ));
    }

    if crate::decompress::format::has_bgzf_markers(gzip_data) {
        return Err(GzippyError::invalid_argument(
            "gzippy-parallel files (BGZF format) have built-in block boundaries; \
use BGZF block headers directly instead of this index",
        ));
    }

    let deflate_offset = match crate::decompress::format::parse_gzip_header_size(gzip_data) {
        Some(header_len) => header_len,
        None => return Err(GzippyError::invalid_argument("Invalid gzip header")),
    };
    if deflate_offset >= gzip_data.len() {
        return Err(GzippyError::invalid_argument(
            "Gzip header extends past file end",
        ));
    }

    // Everything after the header is handed to the scanner; the meaning of the
    // trailing `0` argument is defined by `scan_deflate_fast`.
    let scan_result = match crate::decompress::scan_inflate::scan_deflate_fast(
        &gzip_data[deflate_offset..],
        interval_bytes,
        0,
    ) {
        Ok(result) => result,
        Err(e) => {
            return Err(GzippyError::decompression(format!(
                "Scan failed: {}",
                e
            )))
        }
    };

    let total_uncompressed_size = scan_result.total_output_size as u64;

    let mut points = Vec::new();
    for checkpoint in scan_result.checkpoints {
        // Bit position = bytes consumed (in bits) minus the bits still buffered.
        let compressed_bit_offset = (checkpoint.input_byte_pos as u64)
            .saturating_mul(8)
            .saturating_sub(checkpoint.bitsleft as u64);

        // Copy at most WINDOW_SIZE bytes to the front; the tail stays zeroed.
        let copy_len = std::cmp::min(checkpoint.window.len(), WINDOW_SIZE);
        let mut window = [0u8; WINDOW_SIZE];
        window[..copy_len].copy_from_slice(&checkpoint.window[..copy_len]);

        points.push(IndexPoint {
            compressed_bit_offset,
            uncompressed_offset: checkpoint.output_offset as u64,
            window,
        });
    }

    // Guarantee at least one entry: a checkpoint at the very start of the stream.
    if points.is_empty() {
        points.push(IndexPoint {
            compressed_bit_offset: 0,
            uncompressed_offset: 0,
            window: [0u8; WINDOW_SIZE],
        });
    }

    Ok(SeekIndex {
        points,
        total_uncompressed_size,
        deflate_offset,
    })
}
/// Writes `index` to `writer` in the binary index format.
///
/// Layout (all integers little-endian):
/// `MAGIC`, `u32` deflate offset, `u64` total uncompressed size, `u32` point
/// count, two zero bytes, then for every point: `u64` compressed bit offset,
/// `u64` uncompressed offset and the `WINDOW_SIZE`-byte window.
///
/// # Errors
///
/// Returns an invalid-argument error when the deflate offset or the number
/// of points does not fit in the 32-bit header fields (previously these were
/// silently truncated, yielding an unreadable index). Both are checked before
/// anything is written. I/O failures are returned as `GzippyError::Io`.
pub fn serialize_index(index: &SeekIndex, writer: &mut dyn Write) -> GzippyResult<()> {
    // Validate the narrowing conversions up front so a failure never leaves a
    // partially written header behind.
    let deflate_offset = <u32 as std::convert::TryFrom<usize>>::try_from(index.deflate_offset)
        .map_err(|_| {
            GzippyError::invalid_argument(format!(
                "Deflate offset {} does not fit in the 32-bit index header field",
                index.deflate_offset
            ))
        })?;
    let point_count = <u32 as std::convert::TryFrom<usize>>::try_from(index.points.len())
        .map_err(|_| {
            GzippyError::invalid_argument(format!(
                "Point count {} does not fit in the 32-bit index header field",
                index.points.len()
            ))
        })?;

    writer.write_all(MAGIC).map_err(GzippyError::Io)?;
    writer
        .write_all(&deflate_offset.to_le_bytes())
        .map_err(GzippyError::Io)?;
    writer
        .write_all(&index.total_uncompressed_size.to_le_bytes())
        .map_err(GzippyError::Io)?;
    writer
        .write_all(&point_count.to_le_bytes())
        .map_err(GzippyError::Io)?;
    // Two zero bytes follow the point count; `load_index` skips them.
    writer.write_all(&[0u8; 2]).map_err(GzippyError::Io)?;

    for point in &index.points {
        writer
            .write_all(&point.compressed_bit_offset.to_le_bytes())
            .map_err(GzippyError::Io)?;
        writer
            .write_all(&point.uncompressed_offset.to_le_bytes())
            .map_err(GzippyError::Io)?;
        writer.write_all(&point.window).map_err(GzippyError::Io)?;
    }
    Ok(())
}
/// Parses a serialized index from `data`.
///
/// Expected layout (all integers little-endian):
/// `MAGIC`, `u32` deflate offset, `u64` total uncompressed size, `u32` point
/// count, two bytes that are skipped without inspection, then `point count`
/// records of `u64` compressed bit offset, `u64` uncompressed offset and a
/// `WINDOW_SIZE`-byte window. Bytes after the last record are ignored.
///
/// # Errors
///
/// Returns a parse error when the data is shorter than the fixed header, the
/// magic does not match, the declared point count overflows the addressable
/// size or exceeds the available data, or the points' uncompressed offsets
/// are not strictly increasing.
pub fn load_index(data: &[u8]) -> GzippyResult<SeekIndex> {
    // Callers must have verified that `at + 4 <= bytes.len()`.
    fn read_u32_le(bytes: &[u8], at: usize) -> u32 {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(&bytes[at..at + 4]);
        u32::from_le_bytes(raw)
    }

    // Callers must have verified that `at + 8 <= bytes.len()`.
    fn read_u64_le(bytes: &[u8], at: usize) -> u64 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[at..at + 8]);
        u64::from_le_bytes(raw)
    }

    if data.len() < MAGIC.len() + 4 + 8 + 4 + 2 {
        return Err(GzippyError::parse("Index file too small"));
    }
    if &data[..MAGIC.len()] != MAGIC {
        return Err(GzippyError::parse("Invalid index file magic"));
    }

    let mut offset = MAGIC.len();
    let deflate_offset = read_u32_le(data, offset) as usize;
    offset += 4;
    let total_uncompressed_size = read_u64_le(data, offset);
    offset += 8;
    let point_count = read_u32_le(data, offset) as usize;
    offset += 4;
    // Skip the two bytes that follow the point count.
    offset += 2;

    let point_size = 8 + 8 + WINDOW_SIZE;
    // The point count comes from the file, so the size computation must not be
    // allowed to wrap (it can on 32-bit targets) and defeat the length check.
    let expected_size = point_count
        .checked_mul(point_size)
        .and_then(|body| body.checked_add(offset))
        .ok_or_else(|| {
            GzippyError::parse(format!(
                "Index file declares {} points, which overflows the addressable size",
                point_count
            ))
        })?;
    if data.len() < expected_size {
        return Err(GzippyError::parse(format!(
            "Index file truncated: expected {} bytes, got {}",
            expected_size,
            data.len()
        )));
    }

    // `point_count` is now bounded by the input length, so pre-sizing is safe.
    let mut points = Vec::with_capacity(point_count);
    for record in data[offset..expected_size].chunks_exact(point_size) {
        let mut window = [0u8; WINDOW_SIZE];
        window.copy_from_slice(&record[16..]);
        points.push(IndexPoint {
            compressed_bit_offset: read_u64_le(record, 0),
            uncompressed_offset: read_u64_le(record, 8),
            window,
        });
    }

    let out_of_order = points
        .windows(2)
        .any(|pair| pair[1].uncompressed_offset <= pair[0].uncompressed_offset);
    if out_of_order {
        return Err(GzippyError::parse(
            "Index points are not monotonically increasing",
        ));
    }

    Ok(SeekIndex {
        points,
        total_uncompressed_size,
        deflate_offset,
    })
}
/// Decompresses up to `max_bytes` bytes starting at `uncompressed_offset`
/// and writes them to `writer`, resuming from the nearest checkpoint at or
/// before the requested offset.
///
/// Returns the number of bytes written, which is smaller than `max_bytes`
/// (possibly zero) when the decompressor yields less output than requested.
/// The writer is flushed before returning.
///
/// # Errors
///
/// Returns an invalid-argument error when `gzip_data` lacks the gzip magic
/// bytes, the index has no checkpoints, the requested offset lies before the
/// earliest checkpoint, the index's deflate offset lies beyond `gzip_data`
/// (previously a panic), or an offset does not fit in `usize` on this
/// platform (previously silently truncated). Returns a decompression error
/// when resuming from the checkpoint fails, and propagates writer I/O errors.
pub fn seek_decompress<W: Write>(
    gzip_data: &[u8],
    index: &SeekIndex,
    uncompressed_offset: u64,
    max_bytes: u64,
    writer: &mut W,
) -> GzippyResult<u64> {
    // Narrows a `u64` to `usize`, reporting values that do not fit instead of
    // truncating them.
    fn to_usize(value: u64, what: &str) -> GzippyResult<usize> {
        <usize as std::convert::TryFrom<u64>>::try_from(value).map_err(|_| {
            GzippyError::invalid_argument(format!(
                "{} {} does not fit in usize on this platform",
                what, value
            ))
        })
    }

    if gzip_data.len() < 2 || gzip_data[0] != 0x1f || gzip_data[1] != 0x8b {
        return Err(GzippyError::invalid_argument("Not a gzip file"));
    }

    if uncompressed_offset == 0 {
        match index.points.first() {
            None => return Err(GzippyError::invalid_argument("No checkpoints in index")),
            Some(first) if first.uncompressed_offset != 0 => {
                return Err(GzippyError::invalid_argument(format!(
                    "Cannot seek to 0; earliest checkpoint is at {}",
                    first.uncompressed_offset
                )));
            }
            Some(_) => {}
        }
    }

    // Exact hit, or the checkpoint just before the insertion position
    // (clamped to the first checkpoint).
    let checkpoint_idx = match index
        .points
        .binary_search_by_key(&uncompressed_offset, |p| p.uncompressed_offset)
    {
        Ok(exact) => exact,
        Err(insert_at) => insert_at.saturating_sub(1),
    };
    let checkpoint = index.points.get(checkpoint_idx).ok_or_else(|| {
        GzippyError::invalid_argument(format!(
            "Requested offset {} is beyond uncompressed size {}",
            uncompressed_offset, index.total_uncompressed_size
        ))
    })?;
    if checkpoint.uncompressed_offset > uncompressed_offset {
        return Err(GzippyError::invalid_argument(format!(
            "Seek offset {} is before earliest checkpoint at {}",
            uncompressed_offset, checkpoint.uncompressed_offset
        )));
    }

    // Bytes produced between the checkpoint and the requested offset are
    // decompressed and then discarded.
    let skip_bytes = to_usize(
        uncompressed_offset - checkpoint.uncompressed_offset,
        "Distance from checkpoint",
    )?;
    let start_bit = to_usize(checkpoint.compressed_bit_offset, "Compressed bit offset")?;

    // The index may not belong to this data; a checked slice turns a would-be
    // panic into an error.
    let deflate_data = gzip_data.get(index.deflate_offset..).ok_or_else(|| {
        GzippyError::invalid_argument(format!(
            "Index deflate offset {} is beyond gzip data length {}",
            index.deflate_offset,
            gzip_data.len()
        ))
    })?;

    // A request larger than the address space is clamped rather than wrapped.
    let wanted =
        <usize as std::convert::TryFrom<u64>>::try_from(max_bytes).unwrap_or(usize::MAX);
    // Always ask for at least one byte past the skipped prefix.
    let max_output = skip_bytes
        .saturating_add(wanted)
        .max(skip_bytes.saturating_add(1));

    let (output, _) = crate::backends::inflate_bit::decompress_deflate_from_bit_with_end(
        deflate_data,
        start_bit,
        &checkpoint.window,
        max_output,
    )
    .ok_or_else(|| GzippyError::decompression("Failed to decompress from checkpoint"))?;

    let remaining = if skip_bytes < output.len() {
        &output[skip_bytes..]
    } else {
        &[]
    };
    let to_write = remaining.len().min(wanted);
    writer.write_all(&remaining[..to_write])?;
    writer.flush()?;
    Ok(to_write as u64)
}