use crate::error::{Error, Result};
use flate2::read::DeflateDecoder;
use std::io::{Read, Seek, SeekFrom};
/// Outcome of a structural (header/footer only) walk over a BGZF stream.
///
/// Nothing is decompressed to produce this value; the sizes come from the
/// fields stored in each block.
#[derive(Debug, Clone, Default)]
pub struct BgzfValidation {
    /// `true` when every block that was walked had a well-formed BGZF layout.
    pub is_valid_bgzf: bool,
    /// Number of blocks walked, or `None` when the input ended before a first
    /// header could be read.
    pub block_count: Option<u64>,
    /// Sum of the `ISIZE` footer fields of the walked blocks, or `None` when
    /// the input ended before a first header could be read.
    pub total_uncompressed_size: Option<u64>,
}
/// Outcome of a full verification pass (decompression, CRC32 and ISIZE checks)
/// over a BGZF stream.
#[derive(Debug, Clone, Default)]
pub struct BgzfVerification {
    /// `false` once a block is structurally broken, cannot be read in full, or
    /// fails to decompress. CRC32/ISIZE mismatches alone do not clear it.
    pub is_valid_bgzf: bool,
    /// `false` once any block's stored CRC32 differs from the computed one.
    pub crc_valid: bool,
    /// `false` once any block's stored ISIZE differs from its decompressed length.
    pub isize_valid: bool,
    /// Number of blocks processed, including blocks that failed to decompress.
    pub block_count: u64,
    /// Sum of the block sizes declared by the accepted block headers.
    pub compressed_size: u64,
    /// Sum of the lengths of the successfully decompressed payloads.
    pub uncompressed_size: u64,
    /// Zero-based index of the block in which the first problem was recorded.
    pub first_error_block: Option<u64>,
    /// Human-readable description of the first problem that was recorded.
    pub first_error: Option<String>,
}
/// First two bytes of a gzip member (0x1f, 0x8b), compared against header bytes 0..2.
const GZIP_MAGIC: [u8; 2] = [31, 139];
/// Identifier of the BGZF extra subfield, compared against header bytes 12..14.
const BGZF_SUBFIELD_ID: [u8; 2] = *b"BC";
/// Bit of the gzip flag byte (header byte 3) that signals an extra field.
const FEXTRA_FLAG: u8 = 1 << 2;
/// Bytes inspected per block header: 12 bytes up to and including `XLEN`,
/// plus the 6-byte `BC` subfield that carries `BSIZE`.
const MIN_HEADER_SIZE: usize = 12 + 6;
pub fn is_bgzf<R: Read>(reader: &mut R) -> Result<bool> {
let mut header = [0u8; MIN_HEADER_SIZE];
match reader.read_exact(&mut header) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
return Ok(false);
}
Err(e) => return Err(Error::Io(e)),
}
Ok(validate_bgzf_header(&header))
}
/// Checks the first `MIN_HEADER_SIZE` bytes of a block against the BGZF header layout.
///
/// Byte layout that is inspected:
/// * 0..2   gzip magic (`GZIP_MAGIC`)
/// * 2      compression method, must be 8
/// * 3      flag byte, must have `FEXTRA_FLAG` set (other bits are ignored)
/// * 10..12 little-endian `XLEN`, must be at least 6
/// * 12..14 subfield identifier, must be `BGZF_SUBFIELD_ID`
/// * 14..16 little-endian subfield length, must be 2
///
/// Bytes 16..18 (`BSIZE`) are not examined here. Returns `false` for slices
/// shorter than `MIN_HEADER_SIZE`; extra trailing bytes are ignored.
fn validate_bgzf_header(header: &[u8]) -> bool {
    if header.len() < MIN_HEADER_SIZE {
        return false;
    }

    // Little-endian u16 at a fixed offset; every offset used is below MIN_HEADER_SIZE.
    let le16 = |at: usize| u16::from_le_bytes([header[at], header[at + 1]]);

    let has_magic = header.starts_with(&GZIP_MAGIC);
    let is_deflate = header[2] == 8;
    let has_extra_field = (header[3] & FEXTRA_FLAG) != 0;
    let extra_fits_subfield = le16(10) >= 6;
    let is_bc_subfield = header[12..14] == BGZF_SUBFIELD_ID;
    let subfield_len_ok = le16(14) == 2;

    has_magic
        && is_deflate
        && has_extra_field
        && extra_fits_subfield
        && is_bc_subfield
        && subfield_len_ok
}
/// Structurally validates a BGZF stream from a forward-only reader.
///
/// Delegates to `validate_bgzf_impl` starting at the reader's current
/// position. The reader is never repositioned, so it is left wherever the
/// block walk stopped.
///
/// # Errors
/// Propagates whatever error `validate_bgzf_impl` returns.
pub fn validate_bgzf_streaming<R: Read>(reader: &mut R) -> Result<BgzfValidation> {
    validate_bgzf_impl::<R>(reader)
}
/// Structurally validates a seekable BGZF stream from its very beginning.
///
/// Seeks to offset 0, runs `validate_bgzf_impl`, then seeks back to offset 0
/// so the caller can read the stream again.
///
/// # Errors
/// Returns an error if either seek fails or if `validate_bgzf_impl` fails. When
/// validation itself fails, the final seek is not performed and the reader is
/// left wherever the walk stopped.
pub fn validate_bgzf_strict<R: Read + Seek>(reader: &mut R) -> Result<BgzfValidation> {
    const STREAM_START: SeekFrom = SeekFrom::Start(0);

    reader.seek(STREAM_START)?;
    let outcome = validate_bgzf_impl(reader)?;
    reader.seek(STREAM_START)?;

    Ok(outcome)
}
fn validate_bgzf_impl<R: Read>(reader: &mut R) -> Result<BgzfValidation> {
let mut block_count: u64 = 0;
let mut total_uncompressed_size: u64 = 0;
loop {
let mut header = [0u8; MIN_HEADER_SIZE];
match reader.read_exact(&mut header) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
if block_count == 0 {
return Ok(BgzfValidation {
is_valid_bgzf: false,
block_count: None,
total_uncompressed_size: None,
});
}
break;
}
Err(e) => return Err(Error::Io(e)),
}
if !validate_bgzf_header(&header) {
return Ok(BgzfValidation {
is_valid_bgzf: false,
block_count: Some(block_count),
total_uncompressed_size: Some(total_uncompressed_size),
});
}
let bsize = u16::from_le_bytes([header[16], header[17]]) as u64;
let block_size = bsize + 1;
let remaining = block_size.saturating_sub(MIN_HEADER_SIZE as u64);
if remaining < 8 {
return Ok(BgzfValidation {
is_valid_bgzf: false,
block_count: Some(block_count),
total_uncompressed_size: Some(total_uncompressed_size),
});
}
let skip_to_footer = remaining - 8;
if skip_to_footer > 0 {
std::io::copy(&mut reader.take(skip_to_footer), &mut std::io::sink())?;
}
let mut footer = [0u8; 8];
reader.read_exact(&mut footer)?;
let isize = u32::from_le_bytes([footer[4], footer[5], footer[6], footer[7]]);
total_uncompressed_size += isize as u64;
block_count += 1;
if isize == 0 && block_size == 28 {
break;
}
}
Ok(BgzfValidation {
is_valid_bgzf: true,
block_count: Some(block_count),
total_uncompressed_size: Some(total_uncompressed_size),
})
}
pub fn verify_bgzf<R: Read>(reader: &mut R) -> Result<BgzfVerification> {
let mut result = BgzfVerification {
is_valid_bgzf: true,
crc_valid: true,
isize_valid: true,
..Default::default()
};
loop {
let mut header = [0u8; MIN_HEADER_SIZE];
match reader.read_exact(&mut header) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
if result.block_count == 0 {
result.is_valid_bgzf = false;
result.first_error = Some("Empty or truncated file".to_string());
}
break;
}
Err(e) => return Err(Error::Io(e)),
}
if !validate_bgzf_header(&header) {
result.is_valid_bgzf = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some("Invalid BGZF header".to_string());
}
break;
}
let bsize = u16::from_le_bytes([header[16], header[17]]) as usize;
let block_size = bsize + 1;
result.compressed_size += block_size as u64;
let compressed_data_size = block_size.saturating_sub(MIN_HEADER_SIZE + 8);
if compressed_data_size == 0 && block_size < MIN_HEADER_SIZE + 8 {
result.is_valid_bgzf = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some("Block too small".to_string());
}
break;
}
let mut compressed_data = vec![0u8; compressed_data_size];
if let Err(e) = reader.read_exact(&mut compressed_data) {
result.is_valid_bgzf = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some(format!("Failed to read block data: {}", e));
}
break;
}
let mut footer = [0u8; 8];
if let Err(e) = reader.read_exact(&mut footer) {
result.is_valid_bgzf = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some(format!("Failed to read footer: {}", e));
}
break;
}
let stored_crc = u32::from_le_bytes([footer[0], footer[1], footer[2], footer[3]]);
let stored_isize = u32::from_le_bytes([footer[4], footer[5], footer[6], footer[7]]);
let mut decompressed = Vec::new();
let mut decoder = DeflateDecoder::new(&compressed_data[..]);
if let Err(e) = decoder.read_to_end(&mut decompressed) {
result.is_valid_bgzf = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some(format!("Decompression failed: {}", e));
}
result.block_count += 1;
continue;
}
if decompressed.len() as u32 != stored_isize {
result.isize_valid = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some(format!(
"ISIZE mismatch: stored {} but decompressed {} bytes",
stored_isize,
decompressed.len()
));
}
}
let computed_crc = crc32fast::hash(&decompressed);
if computed_crc != stored_crc {
result.crc_valid = false;
if result.first_error.is_none() {
result.first_error_block = Some(result.block_count);
result.first_error = Some(format!(
"CRC32 mismatch: stored {:08x} but computed {:08x}",
stored_crc, computed_crc
));
}
}
result.uncompressed_size += decompressed.len() as u64;
result.block_count += 1;
if stored_isize == 0 && block_size == 28 {
break;
}
}
Ok(result)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// The 28-byte empty BGZF block used as the end-of-file marker: an 18-byte
    /// header (BSIZE = 27), a 2-byte empty deflate stream, CRC32 = 0, ISIZE = 0.
    const BGZF_EOF: [u8; 28] = [
        0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43,
        0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    ];

    /// Eighteen bytes that start like gzip but have no FEXTRA flag set.
    const PLAIN_GZIP: [u8; 18] = [
        0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,
    ];

    #[test]
    fn test_is_bgzf_with_eof_block() {
        let mut cursor = Cursor::new(&BGZF_EOF);
        assert!(is_bgzf(&mut cursor).unwrap());
    }

    #[test]
    fn test_is_bgzf_with_plain_gzip() {
        // Only the 10-byte fixed gzip header: too short to be a BGZF header.
        let mut cursor = Cursor::new(&PLAIN_GZIP[..10]);
        assert!(!is_bgzf(&mut cursor).unwrap());
    }

    #[test]
    fn test_is_bgzf_with_empty_input() {
        let mut cursor = Cursor::new(Vec::<u8>::new());
        assert!(!is_bgzf(&mut cursor).unwrap());
    }

    #[test]
    fn test_is_bgzf_with_random_data() {
        let random = vec![0xde, 0xad, 0xbe, 0xef, 0x00, 0x01, 0x02, 0x03];
        let mut cursor = Cursor::new(&random);
        assert!(!is_bgzf(&mut cursor).unwrap());
    }

    #[test]
    fn test_validate_strict_eof_only() {
        let mut cursor = Cursor::new(&BGZF_EOF);
        let result = validate_bgzf_strict(&mut cursor).unwrap();
        assert!(result.is_valid_bgzf);
        assert_eq!(result.block_count, Some(1));
        assert_eq!(result.total_uncompressed_size, Some(0));
    }

    #[test]
    fn test_validate_strict_plain_gzip() {
        let mut cursor = Cursor::new(&PLAIN_GZIP);
        let result = validate_bgzf_strict(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
    }

    #[test]
    fn test_validate_strict_rewinds_reader() {
        let mut cursor = Cursor::new(&BGZF_EOF);
        // Start from the middle to prove the function seeks to the beginning first.
        cursor.set_position(5);
        let result = validate_bgzf_strict(&mut cursor).unwrap();
        assert!(result.is_valid_bgzf);
        assert_eq!(cursor.position(), 0);
    }

    #[test]
    fn test_validate_streaming_eof_only() {
        let mut cursor = Cursor::new(&BGZF_EOF);
        let result = validate_bgzf_streaming(&mut cursor).unwrap();
        assert!(result.is_valid_bgzf);
        assert_eq!(result.block_count, Some(1));
        assert_eq!(result.total_uncompressed_size, Some(0));
    }

    #[test]
    fn test_validate_streaming_plain_gzip() {
        let mut cursor = Cursor::new(&PLAIN_GZIP);
        let result = validate_bgzf_streaming(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
    }

    #[test]
    fn test_validate_streaming_empty_input() {
        let mut cursor = Cursor::new(Vec::<u8>::new());
        let result = validate_bgzf_streaming(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
        assert_eq!(result.block_count, None);
        assert_eq!(result.total_uncompressed_size, None);
    }

    #[test]
    fn test_verify_eof_only() {
        let mut cursor = Cursor::new(&BGZF_EOF);
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(result.is_valid_bgzf);
        assert!(result.crc_valid);
        assert!(result.isize_valid);
        assert_eq!(result.block_count, 1);
        assert_eq!(result.compressed_size, 28);
        assert_eq!(result.uncompressed_size, 0);
        assert_eq!(result.first_error_block, None);
        assert_eq!(result.first_error, None);
    }

    #[test]
    fn test_verify_empty_input() {
        let mut cursor = Cursor::new(Vec::<u8>::new());
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
        assert_eq!(result.block_count, 0);
        assert_eq!(result.first_error.as_deref(), Some("Empty or truncated file"));
    }

    #[test]
    fn test_verify_plain_gzip() {
        let mut cursor = Cursor::new(&PLAIN_GZIP);
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
        assert_eq!(result.first_error_block, Some(0));
        assert_eq!(result.first_error.as_deref(), Some("Invalid BGZF header"));
    }

    #[test]
    fn test_verify_detects_crc_mismatch() {
        let mut block = BGZF_EOF;
        block[20] ^= 0xff; // first byte of the stored CRC32
        let mut cursor = Cursor::new(&block[..]);
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(!result.crc_valid);
        assert!(result.isize_valid);
        assert_eq!(result.block_count, 1);
        assert_eq!(result.first_error_block, Some(0));
        assert!(result.first_error.unwrap().starts_with("CRC32 mismatch"));
    }

    #[test]
    fn test_verify_detects_isize_mismatch() {
        let mut block = BGZF_EOF;
        block[24] = 0x01; // first byte of the stored ISIZE
        let mut cursor = Cursor::new(&block[..]);
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(!result.isize_valid);
        assert!(result.crc_valid);
        assert_eq!(result.block_count, 1);
        assert_eq!(result.first_error_block, Some(0));
        assert!(result.first_error.unwrap().starts_with("ISIZE mismatch"));
    }

    #[test]
    fn test_verify_truncated_footer() {
        // Header and payload are complete, the 8-byte footer is missing.
        let mut cursor = Cursor::new(&BGZF_EOF[..20]);
        let result = verify_bgzf(&mut cursor).unwrap();
        assert!(!result.is_valid_bgzf);
        assert_eq!(result.block_count, 0);
        assert_eq!(result.first_error_block, Some(0));
        assert!(result.first_error.unwrap().starts_with("Failed to read footer"));
    }
}