use aho_corasick::AhoCorasick;
/// Kind of marker a [`Scanner`] can report.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum MarkerType {
    /// A position where the `MAGIC_BLOCK` bit pattern was found.
    Block,
    /// A position where the `MAGIC_EOS` bit pattern was found.
    Eos,
}
/// 48-bit signature reported as [`MarkerType::Block`].
///
/// NOTE(review): this equals the bzip2 block-header magic (BCD digits of pi);
/// presumably the scanner targets bzip2 streams - confirm.
const MAGIC_BLOCK: u64 = 0x3141_5926_5359;
/// 48-bit signature reported as [`MarkerType::Eos`].
///
/// NOTE(review): this equals the bzip2 end-of-stream magic (BCD digits of
/// sqrt(pi)) - confirm.
const MAGIC_EOS: u64 = 0x1772_4538_5090;
/// Searches byte buffers for the `MAGIC_BLOCK` / `MAGIC_EOS` bit patterns at
/// any of the eight possible bit alignments.
pub struct Scanner {
/// Aho-Corasick automaton over the 4-byte search keys built in `Scanner::new`
/// (one key per magic per bit shift).
ac: AhoCorasick,
/// Metadata for each automaton pattern, indexed by pattern id:
/// `(full 48-bit magic, marker kind, bit shift in 0..8)`.
patterns_info: Vec<(u64, MarkerType, usize)>,
}
impl Scanner {
    /// Builds the multi-pattern searcher.
    ///
    /// For each magic and each bit shift `0..8`, the magic is placed in the top
    /// 48 bits of a `u64`, shifted right by `shift`, and bytes `1..5` of the
    /// big-endian result become the search key. Those four bytes (bits 8..40)
    /// consist solely of magic bits for every shift below 8, so they are
    /// independent of the data surrounding the marker.
    ///
    /// # Panics
    ///
    /// Panics if the Aho-Corasick automaton cannot be constructed.
    pub fn new() -> Self {
        let magics = [
            (MAGIC_BLOCK, MarkerType::Block),
            (MAGIC_EOS, MarkerType::Eos),
        ];
        let mut keys: Vec<Vec<u8>> = Vec::new();
        let mut patterns_info = Vec::new();
        // Pattern ids follow insertion order: all Block shifts, then all Eos shifts.
        for &(magic, kind) in magics.iter() {
            let top_aligned = magic << 16;
            for shift in 0..8 {
                let word = (top_aligned >> shift).to_be_bytes();
                keys.push(word[1..5].to_vec());
                patterns_info.push((magic, kind, shift));
            }
        }
        let ac = AhoCorasick::new(keys).unwrap();
        Self { ac, patterns_info }
    }

    /// Scans `data` in 1 MiB chunks on a freshly built rayon thread pool and
    /// sends one `(chunk_index, markers)` message per chunk through `sender`.
    ///
    /// Each marker is `(base_offset_bits + bit position within data, kind)`.
    /// Every chunk is searched together with up to 8 following bytes so that a
    /// marker beginning in the chunk's last byte is still found; markers that
    /// begin inside that overlap are left to the next chunk. Every candidate
    /// is confirmed with `verify_magic` against the whole of `data`.
    ///
    /// The call returns once all chunks have been processed. Send failures
    /// are ignored.
    ///
    /// # Panics
    ///
    /// Panics if the thread pool cannot be built.
    pub fn scan_stream(
        &self,
        data: &[u8],
        base_offset_bits: u64,
        sender: crossbeam_channel::Sender<(usize, Vec<(u64, MarkerType)>)>,
    ) {
        const CHUNK_SIZE: usize = 1024 * 1024;
        const OVERLAP: usize = 8;

        let total = data.len();
        let chunk_count = (total + CHUNK_SIZE - 1) / CHUNK_SIZE;
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(rayon::current_num_threads())
            .build()
            .unwrap();
        pool.scope(|scope| {
            for chunk_idx in 0..chunk_count {
                let tx = sender.clone();
                let chunk_start = chunk_idx * CHUNK_SIZE;
                let chunk_len = std::cmp::min(CHUNK_SIZE, total - chunk_start);
                let window_end = std::cmp::min(chunk_start + chunk_len + OVERLAP, total);
                let window = &data[chunk_start..window_end];
                scope.spawn(move |_| {
                    let found: Vec<(u64, MarkerType)> = self
                        .ac
                        .find_iter(window)
                        .filter_map(|hit| {
                            // The key starts one byte after the marker's first byte,
                            // so a hit at window offset 0 has no first byte in view.
                            let marker_byte = hit.start().checked_sub(1)?;
                            if marker_byte >= chunk_len {
                                // Starts in the overlap: the next chunk owns it.
                                return None;
                            }
                            let (magic, mtype, shift) = self.patterns_info[hit.pattern()];
                            let bit_pos =
                                (chunk_start + marker_byte) as u64 * 8 + shift as u64;
                            if verify_magic(data, bit_pos, magic) {
                                Some((base_offset_bits + bit_pos, mtype))
                            } else {
                                None
                            }
                        })
                        .collect();
                    let _ = tx.send((chunk_idx, found));
                });
            }
        });
    }
}
impl Default for Scanner {
fn default() -> Self {
Self::new()
}
}
/// Appends the bits `start_bit..end_bit` of `data` to `out`, left-aligned.
///
/// Bit 0 is the most significant bit of `data[0]`. The output occupies
/// `ceil((end_bit - start_bit) / 8)` bytes; unused low bits of the final byte
/// are cleared. Nothing is appended when `start_bit >= end_bit`.
///
/// In the unaligned case a missing byte just past the end of `data` is read
/// as zero.
///
/// # Panics
///
/// Panics if the requested range indexes bytes outside `data`.
pub fn extract_bits(data: &[u8], start_bit: u64, end_bit: u64, out: &mut Vec<u8>) {
    if end_bit <= start_bit {
        return;
    }
    let total_bits = end_bit - start_bit;
    let out_bytes = ((total_bits + 7) / 8) as usize;
    out.reserve(out_bytes);
    let first_byte = (start_bit / 8) as usize;
    let lead = (start_bit % 8) as u32;

    // Byte-aligned start: plain copy, then trim the trailing partial byte.
    if lead == 0 {
        out.extend_from_slice(&data[first_byte..first_byte + out_bytes]);
        let tail_bits = total_bits % 8;
        if tail_bits != 0 {
            if let Some(last) = out.last_mut() {
                *last &= 0xFFu8 << (8 - tail_bits);
            }
        }
        return;
    }

    // Produces the output byte whose top bit is bit `lead` of `data[pos]`,
    // borrowing the remaining bits from the next byte (zero if absent).
    let fuse = |pos: usize| -> u8 {
        let hi = data[pos];
        let lo = data.get(pos + 1).copied().unwrap_or(0);
        (hi << lead) | (lo >> (8 - lead))
    };

    let mut cursor = first_byte;
    let mut remaining = total_bits;

    // Fast path: eight output bytes at a time while nine input bytes are available.
    while remaining >= 64 && cursor + 9 <= data.len() {
        let mut word = [0u8; 8];
        word.copy_from_slice(&data[cursor..cursor + 8]);
        let head = u64::from_be_bytes(word);
        let spill = u64::from(data[cursor + 8]);
        let merged = (head << lead) | (spill >> (8 - lead));
        out.extend_from_slice(&merged.to_be_bytes());
        cursor += 8;
        remaining -= 64;
    }

    // Remaining whole bytes.
    while remaining >= 8 {
        out.push(fuse(cursor));
        cursor += 1;
        remaining -= 8;
    }

    // Final partial byte: keep only the top `remaining` bits.
    if remaining > 0 {
        let partial = fuse(cursor);
        out.push(partial & (0xFFu8 << (8 - remaining)));
    }
}
/// Returns `true` when the 48-bit `expected_magic` occurs in `data` starting
/// exactly at `bit_offset` (bit 0 is the most significant bit of `data[0]`).
///
/// The whole marker must lie inside `data`: all 48 bits from `bit_offset` on
/// have to be backed by real bytes. Previously only six bytes were required
/// even for unaligned offsets, so zero padding could stand in for up to seven
/// missing trailing bits and a truncated marker whose low bits are zero was
/// accepted.
fn verify_magic(data: &[u8], bit_offset: u64, expected_magic: u64) -> bool {
    const MAGIC_BITS: u64 = 48;

    // Reject anything that does not fit completely (also guards overflow).
    let available_bits = (data.len() as u64).saturating_mul(8);
    match bit_offset.checked_add(MAGIC_BITS) {
        Some(end_bit) if end_bit <= available_bits => {}
        _ => return false,
    }

    // In range: bit_offset / 8 < data.len() was established above.
    let byte_idx = (bit_offset / 8) as usize;
    let shift = (bit_offset % 8) as u32;

    // Load up to eight bytes big-endian; bytes past the end stay zero but are
    // outside the mask whenever the fit check above passed.
    let mut buf = [0u8; 8];
    let len_to_read = std::cmp::min(8, data.len() - byte_idx);
    buf[..len_to_read].copy_from_slice(&data[byte_idx..byte_idx + len_to_read]);
    let window = u64::from_be_bytes(buf);

    // Align the magic and a 48-bit mask to the same bit position in the window.
    let expected = (expected_magic << 16) >> shift;
    let mask = 0xFFFF_FFFF_FFFF_0000u64 >> shift;
    (window & mask) == expected
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `data` with a base offset of 0 and returns every reported marker,
    /// sorted by bit position.
    fn scan_to_vec(data: &[u8]) -> Vec<(u64, MarkerType)> {
        let scanner = Scanner::new();
        // `scan_stream` only returns after every chunk has been sent, and the
        // channel is drained afterwards, so it must never block on a full
        // channel: use an unbounded one.
        let (tx, rx) = crossbeam_channel::unbounded();
        scanner.scan_stream(data, 0, tx);
        let mut results = Vec::new();
        for (_, markers) in rx {
            results.extend(markers);
        }
        results.sort_by_key(|k| k.0);
        results
    }

    #[test]
    fn test_scanner_empty() {
        let data = [];
        let markers = scan_to_vec(&data);
        assert!(markers.is_empty());
    }

    #[test]
    fn test_scanner_single_block() {
        let mut data = Vec::new();
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        data.extend_from_slice(b"some data");
        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Block));
    }

    #[test]
    fn test_scanner_eos() {
        let mut data = Vec::new();
        data.extend_from_slice(&[0x17, 0x72, 0x45, 0x38, 0x50, 0x90]);
        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Eos));
    }

    #[test]
    fn test_scanner_multiple_blocks() {
        let mut data = Vec::new();
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        data.extend_from_slice(b"data1");
        let pos2 = data.len() as u64 * 8;
        data.extend_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        data.extend_from_slice(b"data2");
        let pos_eos = data.len() as u64 * 8;
        data.extend_from_slice(&[0x17, 0x72, 0x45, 0x38, 0x50, 0x90]);
        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 3);
        assert_eq!(markers[0].0, 0);
        assert!(matches!(markers[0].1, MarkerType::Block));
        assert_eq!(markers[1].0, pos2);
        assert!(matches!(markers[1].1, MarkerType::Block));
        assert_eq!(markers[2].0, pos_eos);
        assert!(matches!(markers[2].1, MarkerType::Eos));
    }

    #[test]
    fn test_scanner_shifted() {
        let magic: u64 = 0x314159265359;
        let shift = 1;
        let val = (magic << 16) >> shift;
        let bytes = val.to_be_bytes();
        let markers = scan_to_vec(&bytes);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, shift as u64);
        assert!(matches!(markers[0].1, MarkerType::Block));
    }

    /// A marker whose first byte is the last byte of the first 1 MiB chunk
    /// must be reported exactly once.
    #[test]
    fn test_scanner_marker_across_chunk_boundary() {
        let boundary = 1024 * 1024;
        let mut data = vec![0u8; boundary + 16];
        let marker_start = boundary - 1;
        data[marker_start..marker_start + 6]
            .copy_from_slice(&[0x31, 0x41, 0x59, 0x26, 0x53, 0x59]);
        let markers = scan_to_vec(&data);
        assert_eq!(markers.len(), 1);
        assert_eq!(markers[0].0, marker_start as u64 * 8);
        assert!(matches!(markers[0].1, MarkerType::Block));
    }

    #[test]
    fn test_extract_bits_aligned() {
        let data = vec![0xAA, 0xBB, 0xCC];
        let mut extracted = Vec::new();
        extract_bits(&data, 8, 16, &mut extracted);
        assert_eq!(extracted, vec![0xBB]);
    }

    #[test]
    fn test_extract_bits_shifted() {
        let data = vec![0xAA, 0xBB];
        let mut extracted = Vec::new();
        extract_bits(&data, 4, 12, &mut extracted);
        assert_eq!(extracted, vec![0xAB]);
    }

    #[test]
    fn test_extract_bits_partial() {
        let data = vec![0xFF];
        let mut extracted = Vec::new();
        extract_bits(&data, 0, 4, &mut extracted);
        assert_eq!(extracted, vec![0xF0]);
    }

    #[test]
    fn test_extract_bits_u64_path() {
        let data = vec![0xFF; 10];
        let mut extracted = Vec::new();
        extract_bits(&data, 4, 68, &mut extracted);
        assert_eq!(extracted.len(), 8);
        assert_eq!(extracted, vec![0xFF; 8]);
    }
}