use crate::bits::{BitRead, SliceBitReader};
use crate::huffman::HuffmanDecoder;
pub struct BlockBoundary {
pub bit_offset: usize,
}
const MAX_PRECODE_LENGTH: u32 = 7;
const PRECODE_ALPHABET: [usize; 19] =
[16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15];
pub fn scan_for_block(data: &[u8], start_bit: usize, end_bit: usize) -> Option<BlockBoundary> {
let max_bit = end_bit.min(data.len().saturating_sub(10) * 8);
let mut bit_pos = start_bit;
while bit_pos < max_bit {
let skip = quick_reject_13bit(data, bit_pos);
if skip > 0 {
bit_pos += skip as usize;
continue;
}
if deep_validate(data, bit_pos) {
return Some(BlockBoundary { bit_offset: bit_pos });
}
bit_pos += 1;
}
None
}
#[inline(always)]
fn quick_reject_13bit(data: &[u8], bit_pos: usize) -> u8 {
let byte_pos = bit_pos / 8;
let bit_offset = bit_pos % 8;
if byte_pos + 2 >= data.len() {
return 1;
}
let b0 = data[byte_pos] as u32;
let b1 = data[byte_pos + 1] as u32;
let b2 = data[byte_pos + 2] as u32;
let raw = b0 | (b1 << 8) | (b2 << 16);
let bits = (raw >> bit_offset as u32) & 0x1FFF;
if (bits >> 1) & 3 != 2 {
return 1;
}
if (bits >> 3) & 0x1F > 29 {
return 1;
}
0
}
fn deep_validate(data: &[u8], bit_pos: usize) -> bool {
let byte_pos = bit_pos / 8;
let bit_offset = (bit_pos % 8) as u8;
if byte_pos + 12 >= data.len() {
return false;
}
let mut bits = SliceBitReader::new(data);
bits.set_bit_position(byte_pos, bit_offset);
if bits.read_bits(3).is_err() {
return false;
}
let hlit = match bits.read_bits(5) {
Ok(v) => v as usize + 257,
Err(_) => return false,
};
let hdist = match bits.read_bits(5) {
Ok(v) => v as usize + 1,
Err(_) => return false,
};
let hclen = match bits.read_bits(4) {
Ok(v) => v as usize + 4,
Err(_) => return false,
};
let mut precode_lengths = [0u8; 19];
for i in 0..hclen {
match bits.read_bits(3) {
Ok(v) => precode_lengths[PRECODE_ALPHABET[i]] = v as u8,
Err(_) => return false,
}
}
if !check_precode_leaf_count(&precode_lengths) {
return false;
}
let precode_decoder = match HuffmanDecoder::from_code_lengths(&precode_lengths) {
Ok(d) => d,
Err(_) => return false,
};
let total_codes = hlit + hdist;
let mut all_lengths = Vec::with_capacity(total_codes);
while all_lengths.len() < total_codes {
let sym = match precode_decoder.decode(&mut bits) {
Ok(s) => s,
Err(_) => return false,
};
match sym {
0..=15 => all_lengths.push(sym as u8),
16 => {
let repeat = match bits.read_bits(2) {
Ok(v) => v as usize + 3,
Err(_) => return false,
};
let prev = match all_lengths.last() {
Some(&v) => v,
None => return false, };
for _ in 0..repeat {
all_lengths.push(prev);
}
}
17 => {
let repeat = match bits.read_bits(3) {
Ok(v) => v as usize + 3,
Err(_) => return false,
};
all_lengths.resize(all_lengths.len() + repeat, 0);
}
18 => {
let repeat = match bits.read_bits(7) {
Ok(v) => v as usize + 11,
Err(_) => return false,
};
all_lengths.resize(all_lengths.len() + repeat, 0);
}
_ => return false,
}
if all_lengths.len() > total_codes {
return false;
}
}
if all_lengths.len() != total_codes {
return false;
}
let lit_lengths = &all_lengths[..hlit];
let dist_lengths = &all_lengths[hlit..];
if lit_lengths[256] == 0 {
return false;
}
if !check_huffman_code_lengths(lit_lengths, 15) {
return false;
}
if !dist_lengths.iter().all(|&l| l == 0) && !check_huffman_code_lengths(dist_lengths, 15) {
return false;
}
true
}
fn check_precode_leaf_count(lengths: &[u8; 19]) -> bool {
let mut leaf_count: u32 = 0;
for &cl in lengths {
if cl > 0 {
if cl as u32 > MAX_PRECODE_LENGTH {
return false;
}
leaf_count += 1 << (MAX_PRECODE_LENGTH - cl as u32);
}
}
leaf_count == 128 || leaf_count == 64
}
fn check_huffman_code_lengths(lengths: &[u8], max_code_length: u8) -> bool {
let mut leaf_count: u64 = 0;
for &cl in lengths {
if cl > 0 {
if cl > max_code_length {
return false;
}
leaf_count += 1u64 << (max_code_length - cl);
}
}
let full = 1u64 << max_code_length; let half = full >> 1;
if leaf_count == full {
return true; }
if leaf_count == half {
return lengths.iter().all(|&cl| cl <= 1);
}
false
}
#[inline(always)]
pub fn is_ascii_bioinformatics(byte: u8) -> bool {
(9..=13).contains(&byte) || (32..=126).contains(&byte)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ascii_validation() {
assert!(is_ascii_bioinformatics(9));
assert!(is_ascii_bioinformatics(10));
assert!(is_ascii_bioinformatics(13));
assert!(is_ascii_bioinformatics(32));
assert!(is_ascii_bioinformatics(b'A'));
assert!(is_ascii_bioinformatics(126));
assert!(!is_ascii_bioinformatics(0));
assert!(!is_ascii_bioinformatics(127));
assert!(!is_ascii_bioinformatics(255));
}
#[test]
fn test_precode_leaf_count_valid() {
let mut lens = [0u8; 19];
lens[0] = 1;
lens[1] = 1;
assert!(check_precode_leaf_count(&lens));
}
#[test]
fn test_precode_leaf_count_invalid() {
let mut lens = [0u8; 19];
lens[0] = 2;
assert!(!check_precode_leaf_count(&lens));
}
#[test]
fn test_precode_leaf_count_single_symbol() {
let mut lens = [0u8; 19];
lens[0] = 1;
assert!(check_precode_leaf_count(&lens));
}
#[test]
fn test_check_huffman_code_lengths_valid() {
assert!(check_huffman_code_lengths(&[1, 1], 15));
}
#[test]
fn test_check_huffman_code_lengths_invalid() {
assert!(!check_huffman_code_lengths(&[2], 15));
}
#[test]
fn test_quick_reject_filters_non_dynamic() {
assert_eq!(quick_reject_13bit(&[0x05, 0, 0], 0), 0);
assert!(quick_reject_13bit(&[0x00, 0, 0], 0) > 0); assert!(quick_reject_13bit(&[0x02, 0, 0], 0) > 0); assert!(quick_reject_13bit(&[0x06, 0, 0], 0) > 0); }
#[test]
fn test_scan_finds_block_at_deflate_start() {
use std::io::Write;
let mut encoder =
flate2::write::DeflateEncoder::new(Vec::new(), flate2::Compression::default());
for i in 0..10_000 {
write!(encoder, "@read_{i}\nACGTACGTACGT\n+\nIIIIIIIIIIII\n").unwrap();
if i % 500 == 499 {
encoder.flush().unwrap();
}
}
let compressed = encoder.finish().unwrap();
let mut padded = compressed.clone();
padded.extend_from_slice(&[0u8; 16]);
let result = scan_for_block(&padded, 0, compressed.len() * 8);
assert!(result.is_some(), "Should find block in compressed FASTQ");
}
#[test]
fn test_scan_no_panic_on_random_data() {
let mut data = vec![0u8; 8192 + 16];
let mut state: u64 = 0xDEAD_BEEF_CAFE_BABEu64;
for byte in data.iter_mut().take(8192) {
state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
*byte = (state >> 33) as u8;
}
let _result = scan_for_block(&data, 0, 8192 * 8);
}
}