use crossbeam_queue::ArrayQueue;
use noodles::bam;
use noodles::sam::{Header, alignment::RecordBuf};
use parking_lot::Mutex;
use std::collections::VecDeque;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Read, Write};
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::thread;
use std::time::{Duration, Instant};
use crate::bam_io::is_stdout_path;
use crate::bgzf_reader::{BGZF_EOF, decompress_block_into, read_raw_blocks};
use crate::bgzf_writer::InlineBgzfCompressor;
use crate::progress::ProgressTracker;
use crate::reorder_buffer::ReorderBuffer;
use crate::sam::SamTag;
use noodles::sam::alignment::record::data::field::Tag;
use super::base::{
ActiveSteps, BatchWeight, CompressedBlockBatch, DecodedRecord, DecompressedBatch,
GroupKeyConfig, HasCompressor, HasHeldBoundaries, HasHeldCompressed, HasHeldProcessed,
HasHeldSerialized, HasRecycledBuffers, HasWorkerCore, MemoryEstimate, MonitorableState,
OutputPipelineQueues, OutputPipelineState, PROGRESS_LOG_INTERVAL, PipelineConfig,
PipelineLifecycle, PipelineStats, PipelineStep, PipelineValidationError, ProcessPipelineState,
QueueSample, RawBlockBatch, ReorderBufferState, SerializePipelineState, SerializedBatch,
StepContext, WorkerCoreState, WorkerStateCommon, WritePipelineState, finalize_pipeline,
generic_worker_loop, handle_worker_panic, join_monitor_thread, join_worker_threads,
shared_try_step_compress,
};
use super::deadlock::{
DeadlockAction, DeadlockConfig, DeadlockState, QueueSnapshot, check_deadlock_and_restore,
};
use super::scheduler::{BackpressureState, SchedulerStrategy};
use crate::read_info::{LibraryIndex, compute_group_key};
use crate::sort::bam_fields;
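// Stage topology (queue names match the fields and counters below):
//   q1  raw BGZF blocks        -> Decompress
//   q2  decompressed bytes     -> FindBoundaries (ordered via q2_reorder)
//   q2b record-aligned batches -> Decode
//   q3  decoded records        -> Group (ordered via q3_reorder)
//   q4  groups                 -> Process
//   q5  processed items        -> Serialize
//   q6  serialized batches     -> Compress
//   q7  compressed blocks      -> Write (ordered via write_reorder)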
const IO_BUFFER_SIZE: usize = 8 * 1024 * 1024;
pub const DEFAULT_TARGET_TEMPLATES_PER_BATCH: usize = 500;
#[derive(Debug, Clone)]
pub struct BoundaryBatch {
pub buffer: Vec<u8>,
pub offsets: Vec<usize>,
}
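/// Incremental scanner that splits decompressed BAM bytes into record-aligned
/// batches. Bytes that arrive mid-record are kept in `leftover` and prepended
/// on the next call, so records may span BGZF block (and batch) boundaries.
/// A minimal sketch of the contract (illustrative values, not real BAM
/// records):
///
/// ```ignore
/// let mut state = BoundaryState::new_no_header();
/// // A 6-byte record (4-byte block_size of 2, then 2 payload bytes),
/// // fed in two pieces:
/// let batch = state.find_boundaries(&[2, 0, 0, 0, 0xAA])?;
/// assert_eq!(batch.offsets, vec![0]); // incomplete: held as leftover
/// let batch = state.find_boundaries(&[0xBB])?;
/// assert_eq!(batch.offsets, vec![0, 6]); // one complete record
/// ```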
pub struct BoundaryState {
leftover: Vec<u8>,
work_buffer: Vec<u8>,
header_skipped: bool,
}
impl BoundaryState {
#[must_use]
pub fn new() -> Self {
Self { leftover: Vec::new(), work_buffer: Vec::new(), header_skipped: false }
}
#[must_use]
pub fn new_no_header() -> Self {
Self { leftover: Vec::new(), work_buffer: Vec::new(), header_skipped: true }
}
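    /// Returns the total byte length of the BAM header (magic, `l_text` and
    /// header text, `n_ref` and the reference dictionary), `Some(0)` if the
    /// data does not start with the BAM magic, or `None` if more bytes are
    /// needed to determine the size.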
fn parse_header_size(data: &[u8]) -> Option<usize> {
if data.len() < 8 {
return None;
}
if &data[0..4] != bam_fields::BAM_MAGIC {
return Some(0);
}
let l_text = u32::from_le_bytes([data[4], data[5], data[6], data[7]]) as usize;
let mut offset = 8 + l_text;
if data.len() < offset + 4 {
return None;
}
let n_ref = u32::from_le_bytes([
data[offset],
data[offset + 1],
data[offset + 2],
data[offset + 3],
]) as usize;
offset += 4;
for _ in 0..n_ref {
if data.len() < offset + 4 {
return None;
}
let l_name = u32::from_le_bytes([
data[offset],
data[offset + 1],
data[offset + 2],
data[offset + 3],
]) as usize;
offset += 4 + l_name + 4;
if data.len() < offset {
return None;
}
}
Some(offset)
}
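    /// Appends `decompressed` to any leftover bytes from the previous call
    /// and returns every complete record found. `offsets` holds one entry per
    /// record start plus a trailing end offset, so `offsets.len() - 1` is the
    /// record count; record `i` occupies `buffer[offsets[i]..offsets[i + 1]]`,
    /// beginning with its 4-byte little-endian `block_size`.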
pub fn find_boundaries(&mut self, decompressed: &[u8]) -> io::Result<BoundaryBatch> {
self.work_buffer.clear();
if !self.leftover.is_empty() {
self.work_buffer.append(&mut self.leftover);
}
self.work_buffer.extend_from_slice(decompressed);
let mut cursor = 0usize;
if !self.header_skipped {
if let Some(header_size) = Self::parse_header_size(&self.work_buffer) {
cursor = header_size;
self.header_skipped = true;
} else {
std::mem::swap(&mut self.leftover, &mut self.work_buffer);
return Ok(BoundaryBatch { buffer: Vec::new(), offsets: vec![0] });
}
}
let start_cursor = cursor;
let mut offsets = vec![0usize];
while cursor + 4 <= self.work_buffer.len() {
let block_size = u32::from_le_bytes([
self.work_buffer[cursor],
self.work_buffer[cursor + 1],
self.work_buffer[cursor + 2],
self.work_buffer[cursor + 3],
]) as usize;
let record_end = cursor + 4 + block_size;
            if record_end > self.work_buffer.len() {
                break;
            }
cursor = record_end;
offsets.push(cursor - start_cursor);
}
self.leftover.clear();
self.leftover.extend_from_slice(&self.work_buffer[cursor..]);
let buffer = self.work_buffer[start_cursor..cursor].to_vec();
#[cfg(debug_assertions)]
for i in 0..offsets.len().saturating_sub(1) {
let start = offsets[i];
let end = offsets[i + 1];
if end > start + 4 {
let stored = u32::from_le_bytes([
buffer[start],
buffer[start + 1],
buffer[start + 2],
buffer[start + 3],
]) as usize;
let expected = end - start - 4;
debug_assert_eq!(
stored, expected,
"find_boundaries: block_size mismatch at record {i}: stored={stored}, expected={expected}"
);
}
}
Ok(BoundaryBatch { buffer, offsets })
}
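    /// Flushes the remaining buffered bytes at end of input. Returns
    /// `Ok(None)` when nothing is buffered, an `UnexpectedEof` error if the
    /// stream ends inside a record, and otherwise a final batch covering the
    /// buffered records.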
pub fn finish(&mut self) -> io::Result<Option<BoundaryBatch>> {
if self.leftover.is_empty() {
return Ok(None);
}
let mut offsets = vec![0usize];
let mut cursor = 0usize;
while cursor + 4 <= self.leftover.len() {
let block_size = u32::from_le_bytes([
self.leftover[cursor],
self.leftover[cursor + 1],
self.leftover[cursor + 2],
self.leftover[cursor + 3],
]) as usize;
let record_end = cursor + 4 + block_size;
if record_end > self.leftover.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"Incomplete BAM record at EOF: need {} bytes, have {}",
record_end - cursor,
self.leftover.len() - cursor
),
));
}
cursor = record_end;
offsets.push(cursor);
}
if cursor == 0 {
return Ok(None);
}
Ok(Some(BoundaryBatch { buffer: std::mem::take(&mut self.leftover), offsets }))
}
}
impl Default for BoundaryState {
fn default() -> Self {
Self::new()
}
}
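/// Decodes every record in a boundary-aligned batch and attaches its group
/// key. In `raw_byte_mode` the key is derived directly from the raw BAM bytes
/// without building a `RecordBuf`; otherwise each record is parsed with
/// noodles and keyed via `compute_group_key`.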
pub fn decode_records(
batch: &BoundaryBatch,
group_key_config: &GroupKeyConfig,
) -> io::Result<Vec<DecodedRecord>> {
let num_records = batch.offsets.len().saturating_sub(1);
let mut records = Vec::with_capacity(num_records);
    let header = Header::default();
for i in 0..num_records {
let start = batch.offsets[i];
let end = batch.offsets[i + 1];
if end <= start + 4 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Invalid record bounds: start={start}, end={end}, record_index={i}, \
num_records={num_records}, buffer_len={}",
batch.buffer.len()
),
));
}
let stored_block_size = u32::from_le_bytes([
batch.buffer[start],
batch.buffer[start + 1],
batch.buffer[start + 2],
batch.buffer[start + 3],
]) as usize;
let expected_block_size = end - start - 4;
if stored_block_size != expected_block_size {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Block size mismatch: stored={stored_block_size}, expected={expected_block_size}, \
record_index={i}, start={start}, end={end}, buffer_len={}",
batch.buffer.len()
),
));
}
let record_data = &batch.buffer[start + 4..end];
if group_key_config.raw_byte_mode {
let raw = record_data.to_vec();
if raw.len() < 32 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("BAM record too short: len={}", raw.len()),
));
}
let l_rn = raw[8] as usize;
if raw.len() < 32 + l_rn {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"BAM record truncated: len={}, l_read_name={l_rn} (need >= {})",
raw.len(),
32 + l_rn
),
));
}
let key = compute_group_key_from_raw(
&raw,
&group_key_config.library_index,
group_key_config.cell_tag,
);
records.push(DecodedRecord::from_raw_bytes(raw, key));
} else {
            let mut reader = bam::io::Reader::from(&batch.buffer[start..end]);
let mut record = RecordBuf::default();
reader.read_record_buf(&header, &mut record)?;
let key = compute_group_key(
&record,
&group_key_config.library_index,
group_key_config.cell_tag,
);
records.push(DecodedRecord::new(record, key));
}
}
Ok(records)
}
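/// Raw-byte counterpart of `compute_group_key`: derives the grouping key
/// (unclipped positions, strands, library, cell barcode, name hash) straight
/// from a record's bytes. Secondary and supplementary records get a
/// name-hash-only key; paired records fall back to a single-end key when the
/// mate's unclipped position cannot be derived (mate unmapped or no `MC` tag).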
fn compute_group_key_from_raw(
raw: &[u8],
library_index: &LibraryIndex,
    cell_tag: Option<Tag>,
) -> super::base::GroupKey {
use super::base::GroupKey;
let name = bam_fields::read_name(raw);
let name_hash = if name.is_empty() {
LibraryIndex::hash_name(None)
} else {
LibraryIndex::hash_name(Some(name))
};
let flg = bam_fields::flags(raw);
let is_secondary = (flg & bam_fields::flags::SECONDARY) != 0;
let is_supplementary = (flg & bam_fields::flags::SUPPLEMENTARY) != 0;
if is_secondary || is_supplementary {
return GroupKey { name_hash, ..GroupKey::default() };
}
let reverse = (flg & bam_fields::flags::REVERSE) != 0;
let own_pos = bam_fields::unclipped_5prime_from_raw_bam(raw);
let own_ref_id = bam_fields::ref_id(raw);
let strand = u8::from(reverse);
let aux_data = bam_fields::aux_data_slice(raw);
let cell_tag_bytes = cell_tag.map_or([0u8; 2], |t| [t.as_ref()[0], t.as_ref()[1]]);
let aux_tags = bam_fields::extract_aux_string_tags(aux_data, &cell_tag_bytes);
let library_idx = if let Some(rg) = aux_tags.rg {
let rg_hash = LibraryIndex::hash_rg(rg);
library_index.get(rg_hash)
} else {
0
};
let cell_hash =
if let Some(cb) = aux_tags.cell { LibraryIndex::hash_cell_barcode(Some(cb)) } else { 0 };
let is_paired = (flg & bam_fields::flags::PAIRED) != 0;
if !is_paired {
return GroupKey::single(own_ref_id, own_pos, strand, library_idx, cell_hash, name_hash);
}
let mate_unmapped = (flg & bam_fields::flags::MATE_UNMAPPED) != 0;
let mate_reverse = (flg & bam_fields::flags::MATE_REVERSE) != 0;
let mate_strand = u8::from(mate_reverse);
let raw_mate_ref_id = bam_fields::mate_ref_id(raw);
let raw_mate_pos = bam_fields::mate_pos(raw);
let mate_pos_result = if mate_unmapped {
None
} else {
aux_tags
.mc
.map(|mc| bam_fields::mate_unclipped_5prime_1based(raw_mate_pos, mate_reverse, mc))
};
match mate_pos_result {
Some(mp) => GroupKey::paired(
own_ref_id,
own_pos,
strand,
raw_mate_ref_id,
mp,
mate_strand,
library_idx,
cell_hash,
name_hash,
),
None => {
GroupKey::single(own_ref_id, own_pos, strand, library_idx, cell_hash, name_hash)
}
}
}
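/// Shared state for the multi-threaded BAM pipeline. Stages communicate
/// through bounded `ArrayQueue`s; stages that must consume their input in
/// read order (FindBoundaries, Group, Write) pull through a serial-numbered
/// `ReorderBuffer` first. `G` is the grouper's group type and `P` the
/// processed output type.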
pub struct BamPipelineState<G, P: MemoryEstimate> {
pub config: PipelineConfig,
pub input_file: Mutex<Option<Box<dyn Read + Send>>>,
pub read_done: AtomicBool,
pub next_read_serial: AtomicU64,
pub q1_raw_blocks: ArrayQueue<(u64, RawBlockBatch)>,
#[cfg(feature = "memory-debug")]
pub q1_heap_bytes: AtomicU64,
pub decompress_done: AtomicBool,
pub batches_decompressed: AtomicU64,
pub q2_decompressed: ArrayQueue<(u64, DecompressedBatch)>,
pub q2_reorder: Mutex<ReorderBuffer<DecompressedBatch>>,
pub q2_reorder_state: ReorderBufferState,
pub boundary_state: Mutex<BoundaryState>,
pub boundary_done: AtomicBool,
pub next_boundary_serial: AtomicU64,
pub batches_boundary_found: AtomicU64,
pub batches_boundary_processed: AtomicU64,
pub q2b_boundaries: ArrayQueue<(u64, BoundaryBatch)>,
pub decode_done: AtomicBool,
pub batches_decoded: AtomicU64,
pub group_key_config: GroupKeyConfig,
pub q3_decoded: ArrayQueue<(u64, Vec<DecodedRecord>)>,
pub q3_reorder: Mutex<ReorderBuffer<Vec<DecodedRecord>>>,
pub q3_reorder_state: ReorderBufferState,
pub q3_reorder_can_pop: AtomicBool,
pub group_done: AtomicBool,
pub next_group_serial: AtomicU64,
pub batches_grouped: AtomicU64,
pub output: OutputPipelineQueues<G, P>,
pub deadlock_state: DeadlockState,
}
impl<G: Send, P: Send + MemoryEstimate> BamPipelineState<G, P> {
#[must_use]
pub fn new(
config: PipelineConfig,
input: Box<dyn Read + Send>,
output: Box<dyn Write + Send>,
group_key_config: GroupKeyConfig,
) -> Self {
let cap = config.queue_capacity;
let memory_limit = config.queue_memory_limit;
let stats = if config.collect_stats {
config.shared_stats.clone().or_else(|| Some(Arc::new(PipelineStats::new())))
} else {
None
};
let boundary_state = if config.header_already_read {
BoundaryState::new_no_header()
} else {
BoundaryState::new()
};
let deadlock_config =
DeadlockConfig::new(config.deadlock_timeout_secs, config.deadlock_recover_enabled);
let deadlock_state = DeadlockState::new(&deadlock_config, memory_limit);
Self {
config,
input_file: Mutex::new(Some(input)),
read_done: AtomicBool::new(false),
next_read_serial: AtomicU64::new(0),
q1_raw_blocks: ArrayQueue::new(cap),
#[cfg(feature = "memory-debug")]
q1_heap_bytes: AtomicU64::new(0),
decompress_done: AtomicBool::new(false),
batches_decompressed: AtomicU64::new(0),
q2_decompressed: ArrayQueue::new(cap),
q2_reorder: Mutex::new(ReorderBuffer::new()),
q2_reorder_state: ReorderBufferState::new(memory_limit),
boundary_state: Mutex::new(boundary_state),
boundary_done: AtomicBool::new(false),
next_boundary_serial: AtomicU64::new(0),
batches_boundary_found: AtomicU64::new(0),
batches_boundary_processed: AtomicU64::new(0),
q2b_boundaries: ArrayQueue::new(cap),
decode_done: AtomicBool::new(false),
batches_decoded: AtomicU64::new(0),
group_key_config,
q3_decoded: ArrayQueue::new(cap),
q3_reorder: Mutex::new(ReorderBuffer::new()),
q3_reorder_state: ReorderBufferState::new(memory_limit),
q3_reorder_can_pop: AtomicBool::new(false),
group_done: AtomicBool::new(false),
next_group_serial: AtomicU64::new(0),
batches_grouped: AtomicU64::new(0),
output: OutputPipelineQueues::new(
cap,
output,
stats,
"Processed records",
memory_limit,
),
deadlock_state,
}
}
pub fn set_error(&self, error: io::Error) {
self.output.set_error(error);
}
#[must_use]
pub fn has_error(&self) -> bool {
self.output.has_error()
}
pub fn take_error(&self) -> Option<io::Error> {
self.output.take_error()
}
#[must_use]
pub fn is_complete(&self) -> bool {
if !self.read_done.load(Ordering::Acquire) || !self.group_done.load(Ordering::Acquire) {
return false;
}
if !self.q1_raw_blocks.is_empty()
|| !self.q2_decompressed.is_empty()
|| !self.q2b_boundaries.is_empty()
|| !self.q3_decoded.is_empty()
{
return false;
}
let q2_empty = self.q2_reorder.lock().is_empty();
let q3_empty = self.q3_reorder.lock().is_empty();
if !q2_empty || !q3_empty {
return false;
}
self.output.are_queues_empty()
}
#[must_use]
pub fn queue_depths(&self) -> QueueDepths {
let output_depths = self.output.queue_depths();
QueueDepths {
q1: self.q1_raw_blocks.len(),
q2: self.q2_decompressed.len(),
q2b: self.q2b_boundaries.len(),
q3: self.q3_decoded.len(),
q4: output_depths.groups,
q5: output_depths.processed,
q6: output_depths.serialized,
q7: output_depths.compressed,
}
}
#[must_use]
pub fn can_decompress_proceed(&self, serial: u64) -> bool {
self.q2_reorder_state.can_proceed(serial)
}
#[must_use]
pub fn can_decode_proceed(&self, serial: u64) -> bool {
self.q3_reorder_state.can_proceed(serial)
}
#[must_use]
pub fn is_memory_high(&self) -> bool {
self.q3_reorder_state.is_memory_high()
}
#[must_use]
pub fn is_memory_drained(&self) -> bool {
self.q3_reorder_state.is_memory_drained()
}
#[must_use]
pub fn is_q5_memory_high(&self) -> bool {
self.output.is_processed_memory_high()
}
#[must_use]
pub fn is_draining(&self) -> bool {
self.output.is_draining()
}
#[must_use]
pub fn stats(&self) -> Option<&PipelineStats> {
self.output.stats.as_deref()
}
#[must_use]
pub fn progress(&self) -> &ProgressTracker {
&self.output.progress
}
#[must_use]
pub fn items_written(&self) -> u64 {
self.output.items_written.load(Ordering::Relaxed)
}
pub fn set_draining(&self, value: bool) {
self.output.set_draining(value);
}
pub fn flush_output(&self) -> io::Result<()> {
if let Some(mut writer) = self.output.output.lock().take() {
writer.flush()?;
writer.write_all(&BGZF_EOF)?;
writer.flush()?;
}
Ok(())
}
pub fn validate_completion(&self) -> Result<(), PipelineValidationError> {
let mut non_empty_queues = Vec::new();
let mut counter_mismatches = Vec::new();
if !self.q1_raw_blocks.is_empty() {
non_empty_queues.push(format!("q1_raw_blocks ({})", self.q1_raw_blocks.len()));
}
if !self.q2_decompressed.is_empty() {
non_empty_queues.push(format!("q2_decompressed ({})", self.q2_decompressed.len()));
}
if !self.q2b_boundaries.is_empty() {
non_empty_queues.push(format!("q2b_boundaries ({})", self.q2b_boundaries.len()));
}
if !self.q3_decoded.is_empty() {
non_empty_queues.push(format!("q3_decoded ({})", self.q3_decoded.len()));
}
if !self.output.groups.is_empty() {
non_empty_queues.push(format!("q4_groups ({})", self.output.groups.len()));
}
if !self.output.processed.is_empty() {
non_empty_queues.push(format!("q5_processed ({})", self.output.processed.len()));
}
if !self.output.serialized.is_empty() {
non_empty_queues.push(format!("q6_serialized ({})", self.output.serialized.len()));
}
if !self.output.compressed.is_empty() {
non_empty_queues.push(format!("q7_compressed ({})", self.output.compressed.len()));
}
{
let q2_reorder = self.q2_reorder.lock();
if !q2_reorder.is_empty() {
non_empty_queues.push(format!("q2_reorder ({})", q2_reorder.len()));
}
}
{
let q3_reorder = self.q3_reorder.lock();
if !q3_reorder.is_empty() {
non_empty_queues.push(format!("q3_reorder ({})", q3_reorder.len()));
}
}
{
let write_reorder = self.output.write_reorder.lock();
if !write_reorder.is_empty() {
non_empty_queues.push(format!("write_reorder ({})", write_reorder.len()));
}
}
let total_read = self.next_read_serial.load(Ordering::Acquire);
let batches_decompressed = self.batches_decompressed.load(Ordering::Acquire);
let batches_boundary_processed = self.batches_boundary_processed.load(Ordering::Acquire);
let batches_boundary_found = self.batches_boundary_found.load(Ordering::Acquire);
let batches_decoded = self.batches_decoded.load(Ordering::Acquire);
let batches_grouped = self.batches_grouped.load(Ordering::Acquire);
if batches_decompressed != total_read {
counter_mismatches.push(format!(
"batches_decompressed ({batches_decompressed}) != total_read ({total_read})"
));
}
if batches_boundary_processed != total_read {
counter_mismatches.push(format!(
"batches_boundary_processed ({batches_boundary_processed}) != total_read ({total_read})"
));
}
if batches_decoded != batches_boundary_found {
counter_mismatches.push(format!(
"batches_decoded ({batches_decoded}) != batches_boundary_found ({batches_boundary_found})"
));
}
if batches_grouped != batches_boundary_found {
counter_mismatches.push(format!(
"batches_grouped ({batches_grouped}) != batches_boundary_found ({batches_boundary_found})"
));
}
let leaked_heap_bytes = 0u64;
if !non_empty_queues.is_empty() || !counter_mismatches.is_empty() {
return Err(PipelineValidationError {
non_empty_queues,
counter_mismatches,
leaked_heap_bytes,
});
}
Ok(())
}
}
impl<G: Send + 'static, P: Send + MemoryEstimate + 'static> PipelineLifecycle
for BamPipelineState<G, P>
{
fn is_complete(&self) -> bool {
BamPipelineState::is_complete(self)
}
fn has_error(&self) -> bool {
BamPipelineState::has_error(self)
}
fn take_error(&self) -> Option<io::Error> {
BamPipelineState::take_error(self)
}
fn set_error(&self, error: io::Error) {
BamPipelineState::set_error(self, error);
}
fn is_draining(&self) -> bool {
BamPipelineState::is_draining(self)
}
fn set_draining(&self, value: bool) {
BamPipelineState::set_draining(self, value);
}
fn stats(&self) -> Option<&PipelineStats> {
BamPipelineState::stats(self)
}
fn progress(&self) -> &ProgressTracker {
BamPipelineState::progress(self)
}
fn items_written(&self) -> u64 {
BamPipelineState::items_written(self)
}
fn flush_output(&self) -> io::Result<()> {
BamPipelineState::flush_output(self)
}
fn validate_completion(&self) -> Result<(), PipelineValidationError> {
BamPipelineState::validate_completion(self)
}
}
impl<G: Send + 'static, P: Send + MemoryEstimate + 'static> MonitorableState
for BamPipelineState<G, P>
{
fn deadlock_state(&self) -> &DeadlockState {
&self.deadlock_state
}
fn build_queue_snapshot(&self) -> QueueSnapshot {
let q2_reorder_mem = {
let reorder = self.q2_reorder.lock();
reorder.total_heap_size() as u64
};
let q3_reorder_mem = {
let reorder = self.q3_reorder.lock();
reorder.total_heap_size() as u64
};
QueueSnapshot {
q1_len: self.q1_raw_blocks.len(),
q2_len: self.q2_decompressed.len(),
q2b_len: self.q2b_boundaries.len(),
q3_len: self.q3_decoded.len(),
q4_len: self.output.groups.len(),
q5_len: self.output.processed.len(),
q6_len: self.output.serialized.len(),
q7_len: self.output.compressed.len(),
q2_reorder_mem,
q3_reorder_mem,
memory_limit: self.deadlock_state.get_memory_limit(),
read_done: self.read_done.load(Ordering::Relaxed),
group_done: self.group_done.load(Ordering::Relaxed),
draining: self.output.draining.load(Ordering::Relaxed),
extra_state: None,
}
}
}
impl<G: Send + 'static, P: Send + MemoryEstimate + 'static> OutputPipelineState
for BamPipelineState<G, P>
{
type Processed = P;
fn has_error(&self) -> bool {
self.output.has_error()
}
fn set_error(&self, error: io::Error) {
self.output.set_error(error);
}
fn q5_pop(&self) -> Option<(u64, SerializedBatch)> {
self.output.serialized.pop()
}
fn q5_push(&self, item: (u64, SerializedBatch)) -> Result<(), (u64, SerializedBatch)> {
self.output.serialized.push(item)
}
fn q5_is_full(&self) -> bool {
self.output.serialized.is_full()
}
fn q5_track_pop(&self, heap_size: u64) {
self.output.serialized_heap_bytes.fetch_sub(heap_size, Ordering::AcqRel);
}
fn q6_pop(&self) -> Option<(u64, CompressedBlockBatch)> {
self.output.compressed.pop()
}
fn q6_push(
&self,
item: (u64, CompressedBlockBatch),
) -> Result<(), (u64, CompressedBlockBatch)> {
let heap_size = item.1.estimate_heap_size();
let result = self.output.compressed.push(item);
if result.is_ok() {
self.output.compressed_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
}
result
}
fn q6_is_full(&self) -> bool {
self.output.compressed.is_full()
}
fn q6_track_pop(&self, heap_size: u64) {
self.output.compressed_heap_bytes.fetch_sub(heap_size, Ordering::AcqRel);
}
fn q6_reorder_insert(&self, serial: u64, batch: CompressedBlockBatch) {
self.output.write_reorder.lock().insert(serial, batch);
}
fn q6_reorder_try_pop_next(&self) -> Option<CompressedBlockBatch> {
self.output.write_reorder.lock().try_pop_next()
}
fn output_try_lock(
&self,
) -> Option<parking_lot::MutexGuard<'_, Option<Box<dyn Write + Send>>>> {
self.output.output.try_lock()
}
fn increment_written(&self) -> u64 {
self.output.items_written.fetch_add(1, Ordering::Release)
}
fn record_compressed_bytes_out(&self, bytes: u64) {
if let Some(ref stats) = self.output.stats {
stats.compressed_bytes_out.fetch_add(bytes, Ordering::Relaxed);
}
}
fn record_q6_pop_progress(&self) {
self.deadlock_state.record_q6_pop();
}
fn record_q7_push_progress(&self) {
self.deadlock_state.record_q7_push();
}
fn write_reorder_can_proceed(&self, serial: u64) -> bool {
self.output.write_reorder_state.can_proceed(serial)
}
fn write_reorder_is_memory_high(&self) -> bool {
self.output.write_reorder_state.is_memory_high()
}
fn stats(&self) -> Option<&PipelineStats> {
self.output.stats.as_deref()
}
}
impl<G: Send + MemoryEstimate + 'static, P: Send + MemoryEstimate + 'static>
ProcessPipelineState<G, P> for BamPipelineState<G, P>
{
fn process_input_pop(&self) -> Option<(u64, Vec<G>)> {
let result = self.output.groups.pop();
if result.is_some() {
self.deadlock_state.record_q4_pop();
}
result
}
fn process_output_is_full(&self) -> bool {
self.output.processed.is_full()
}
fn process_output_push(&self, item: (u64, Vec<P>)) -> Result<(), (u64, Vec<P>)> {
let heap_size: usize = item.1.iter().map(MemoryEstimate::estimate_heap_size).sum();
let result = self.output.processed.push(item);
if result.is_ok() {
self.output.processed_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
self.deadlock_state.record_q5_push();
}
result
}
fn has_error(&self) -> bool {
self.output.has_error()
}
fn set_error(&self, error: io::Error) {
self.output.set_error(error);
}
fn should_apply_process_backpressure(&self) -> bool {
self.output.should_apply_process_backpressure()
}
fn is_draining(&self) -> bool {
self.output.is_draining()
}
}
impl<G: Send + 'static, P: Send + MemoryEstimate + 'static> SerializePipelineState<P>
for BamPipelineState<G, P>
{
fn serialize_input_pop(&self) -> Option<(u64, Vec<P>)> {
let result = self.output.processed.pop();
if let Some((_, ref batch)) = result {
let heap_size: usize = batch.iter().map(MemoryEstimate::estimate_heap_size).sum();
self.output.processed_heap_bytes.fetch_sub(heap_size as u64, Ordering::AcqRel);
self.deadlock_state.record_q5_pop();
}
result
}
fn serialize_output_is_full(&self) -> bool {
self.output.serialized.is_full()
}
fn serialize_output_push(
&self,
item: (u64, SerializedBatch),
) -> Result<(), (u64, SerializedBatch)> {
let heap_size = item.1.estimate_heap_size();
let result = self.output.serialized.push(item);
if result.is_ok() {
self.output.serialized_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
self.deadlock_state.record_q6_push();
}
result
}
fn has_error(&self) -> bool {
self.output.has_error()
}
fn set_error(&self, error: io::Error) {
self.output.set_error(error);
}
fn record_serialized_bytes(&self, bytes: u64) {
if let Some(ref stats) = self.output.stats {
stats.serialized_bytes.fetch_add(bytes, Ordering::Relaxed);
}
}
}
impl<G: Send + 'static, P: Send + MemoryEstimate + 'static> WritePipelineState
for BamPipelineState<G, P>
{
fn write_input_queue(&self) -> &ArrayQueue<(u64, CompressedBlockBatch)> {
&self.output.compressed
}
fn write_reorder_buffer(&self) -> &Mutex<ReorderBuffer<CompressedBlockBatch>> {
&self.output.write_reorder
}
fn write_reorder_state(&self) -> &super::base::ReorderBufferState {
&self.output.write_reorder_state
}
fn write_output(&self) -> &Mutex<Option<Box<dyn Write + Send>>> {
&self.output.output
}
fn has_error(&self) -> bool {
self.output.has_error()
}
fn set_error(&self, error: io::Error) {
self.output.set_error(error);
}
fn record_written(&self, count: u64) {
self.output.items_written.fetch_add(count, Ordering::Release);
}
fn stats(&self) -> Option<&PipelineStats> {
self.output.stats.as_deref()
}
}
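/// Point-in-time snapshot of every inter-stage queue length, used to decide
/// which steps currently have input available.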
#[derive(Debug, Clone, Copy)]
pub struct QueueDepths {
pub q1: usize,
pub q2: usize,
pub q2b: usize,
pub q3: usize,
pub q4: usize,
pub q5: usize,
pub q6: usize,
pub q7: usize,
}
impl QueueDepths {
#[inline]
#[must_use]
pub fn has_input_for_step(&self, step: PipelineStep) -> bool {
match step {
            PipelineStep::Read => true,
            PipelineStep::Decompress => self.q1 > 0,
PipelineStep::FindBoundaries => self.q2 > 0,
PipelineStep::Decode => self.q2b > 0,
PipelineStep::Group => self.q3 > 0,
PipelineStep::Process => self.q4 > 0,
PipelineStep::Serialize => self.q5 > 0,
PipelineStep::Compress => self.q6 > 0,
PipelineStep::Write => self.q7 > 0,
}
}
}
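/// Strategy object for the Group stage: accumulates decoded records and emits
/// completed groups. `add_records` may emit zero or more groups per call;
/// `finish` flushes whatever is still buffered once all input has been seen.
///
/// A minimal pass-through grouper that emits each incoming batch as one group
/// (hypothetical, for illustration only):
///
/// ```ignore
/// struct PassThroughGrouper;
///
/// impl Grouper for PassThroughGrouper {
///     type Group = Vec<DecodedRecord>;
///
///     fn add_records(&mut self, records: Vec<DecodedRecord>) -> io::Result<Vec<Self::Group>> {
///         Ok(vec![records]) // one group per batch, nothing buffered
///     }
///
///     fn finish(&mut self) -> io::Result<Option<Self::Group>> {
///         Ok(None) // never buffers, so never has anything to flush
///     }
///
///     fn has_pending(&self) -> bool {
///         false
///     }
/// }
/// ```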
pub trait Grouper: Send {
type Group: Send;
fn add_records(&mut self, records: Vec<DecodedRecord>) -> io::Result<Vec<Self::Group>>;
fn finish(&mut self) -> io::Result<Option<Self::Group>>;
fn has_pending(&self) -> bool;
}
pub struct GroupState<G: Send> {
pub grouper: Box<dyn Grouper<Group = G> + Send>,
finished: bool,
pending_groups: VecDeque<G>,
pending_weight: usize,
}
impl<G: Send> GroupState<G> {
#[must_use]
pub fn new(grouper: Box<dyn Grouper<Group = G> + Send>) -> Self {
Self { grouper, finished: false, pending_groups: VecDeque::new(), pending_weight: 0 }
}
#[must_use]
pub fn has_pending_output(&self) -> bool {
!self.pending_groups.is_empty()
}
pub fn process(&mut self, records: Vec<DecodedRecord>) -> io::Result<Vec<G>> {
self.grouper.add_records(records)
}
pub fn finish(&mut self) -> io::Result<Option<G>> {
if self.finished {
return Ok(None);
}
self.finished = true;
self.grouper.finish()
}
#[must_use]
pub fn is_finished(&self) -> bool {
self.finished
}
#[must_use]
pub fn has_pending(&self) -> bool {
self.grouper.has_pending()
}
}
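/// The user-supplied closures that define the Process and Serialize stages.
/// `process_fn` turns one group into a processed item; `serialize_fn` appends
/// the item's output bytes to the provided buffer and returns the number of
/// records written; `secondary_serialize_fn`, when present, is called before
/// the primary serializer and its bytes are routed to the secondary output.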
#[allow(clippy::type_complexity)]
pub struct PipelineFunctions<G: Send, P: Send> {
pub process_fn: Box<dyn Fn(G) -> io::Result<P> + Send + Sync>,
pub serialize_fn: Box<dyn Fn(P, &mut Vec<u8>) -> io::Result<u64> + Send + Sync>,
pub secondary_serialize_fn:
Option<Box<dyn Fn(&P, &mut Vec<u8>) -> io::Result<u64> + Send + Sync>>,
}
impl<G: Send, P: Send> PipelineFunctions<G, P> {
pub fn new<ProcessFn, SerializeFn>(process_fn: ProcessFn, serialize_fn: SerializeFn) -> Self
where
ProcessFn: Fn(G) -> io::Result<P> + Send + Sync + 'static,
SerializeFn: Fn(P, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
Self {
process_fn: Box::new(process_fn),
serialize_fn: Box::new(serialize_fn),
secondary_serialize_fn: None,
}
}
#[must_use]
pub fn with_secondary_serialize<F>(mut self, f: F) -> Self
where
F: Fn(&P, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
self.secondary_serialize_fn = Some(Box::new(f));
self
}
}
const DECOMPRESSION_BUFFER_CAPACITY: usize = 256 * 1024;
const SERIALIZATION_BUFFER_CAPACITY: usize = 64 * 1024;
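/// Per-thread worker state. Each `held_*` slot parks an item that was
/// produced but could not be pushed because its destination queue was full;
/// the corresponding step retries the held item first on its next attempt so
/// nothing is dropped or reordered.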
pub struct WorkerState<P: Send> {
pub core: WorkerCoreState,
pub decompressor: libdeflater::Decompressor,
pub decompression_buffer: Vec<u8>,
pub held_raw: Option<(u64, RawBlockBatch)>,
pub held_decompressed: Option<(u64, DecompressedBatch, usize)>,
pub held_boundaries: Option<(u64, BoundaryBatch)>,
pub held_decoded: Option<(u64, Vec<DecodedRecord>, usize)>,
pub held_processed: Option<(u64, Vec<P>, usize)>,
pub held_serialized: Option<(u64, SerializedBatch, usize)>,
pub held_compressed: Option<(u64, CompressedBlockBatch, usize)>,
}
impl<P: Send> WorkerState<P> {
#[must_use]
pub fn new(
compression_level: u32,
thread_id: usize,
num_threads: usize,
scheduler_strategy: SchedulerStrategy,
) -> Self {
Self {
core: WorkerCoreState::new(
compression_level,
thread_id,
num_threads,
scheduler_strategy,
ActiveSteps::all(),
),
decompressor: libdeflater::Decompressor::new(),
decompression_buffer: Vec::with_capacity(DECOMPRESSION_BUFFER_CAPACITY),
held_raw: None,
held_decompressed: None,
held_boundaries: None,
held_decoded: None,
held_processed: None,
held_serialized: None,
held_compressed: None,
}
}
#[inline]
#[must_use]
pub fn has_any_held_items(&self) -> bool {
self.held_raw.is_some()
|| self.held_decompressed.is_some()
|| self.held_boundaries.is_some()
|| self.held_decoded.is_some()
|| self.held_processed.is_some()
|| self.held_serialized.is_some()
|| self.held_compressed.is_some()
}
pub fn clear_held_items(&mut self) {
self.held_raw = None;
self.held_decompressed = None;
self.held_boundaries = None;
self.held_decoded = None;
self.held_processed = None;
self.held_serialized = None;
self.held_compressed = None;
}
}
impl<P: Send> HasCompressor for WorkerState<P> {
fn compressor_mut(&mut self) -> &mut InlineBgzfCompressor {
&mut self.core.compressor
}
}
impl<P: Send> HasRecycledBuffers for WorkerState<P> {
fn take_or_alloc_buffer(&mut self, capacity: usize) -> Vec<u8> {
self.core.take_or_alloc_buffer(capacity)
}
fn recycle_buffer(&mut self, buf: Vec<u8>) {
self.core.recycle_buffer(buf);
}
}
impl<P: Send> HasHeldCompressed for WorkerState<P> {
fn held_compressed_mut(&mut self) -> &mut Option<(u64, CompressedBlockBatch, usize)> {
&mut self.held_compressed
}
}
impl<P: Send> HasHeldBoundaries<BoundaryBatch> for WorkerState<P> {
fn held_boundaries_mut(&mut self) -> &mut Option<(u64, BoundaryBatch)> {
&mut self.held_boundaries
}
}
impl<P: Send> HasHeldProcessed<P> for WorkerState<P> {
fn held_processed_mut(&mut self) -> &mut Option<(u64, Vec<P>, usize)> {
&mut self.held_processed
}
}
impl<P: Send> HasHeldSerialized for WorkerState<P> {
fn held_serialized_mut(&mut self) -> &mut Option<(u64, SerializedBatch, usize)> {
&mut self.held_serialized
}
fn serialization_buffer_mut(&mut self) -> &mut Vec<u8> {
&mut self.core.serialization_buffer
}
fn serialization_buffer_capacity(&self) -> usize {
        SERIALIZATION_BUFFER_CAPACITY
    }
}
impl<P: Send> WorkerStateCommon for WorkerState<P> {
fn has_any_held_items(&self) -> bool {
WorkerState::has_any_held_items(self)
}
fn clear_held_items(&mut self) {
WorkerState::clear_held_items(self);
}
}
impl<P: Send> HasWorkerCore for WorkerState<P> {
fn core(&self) -> &WorkerCoreState {
&self.core
}
fn core_mut(&mut self) -> &mut WorkerCoreState {
&mut self.core
}
}
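// Each `try_step_*` function below follows the same shape: first retry any
// held item from a previously failed push, then bail out under backpressure,
// then pop one unit of input, do the work, and push the result downstream,
// parking it in the worker's `held_*` slot if the output queue is full. The
// returned bool tells the scheduler whether progress was made; the
// `(bool, bool)` variants also report whether they hit lock contention.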
fn try_step_read<G: Send, P: Send + MemoryEstimate>(
state: &BamPipelineState<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
if let Some((serial, held)) = worker.held_raw.take() {
#[cfg(feature = "memory-debug")]
let q1_bytes = held.total_compressed_size() as u64;
match state.q1_raw_blocks.push((serial, held)) {
Ok(()) => {
#[cfg(feature = "memory-debug")]
state.q1_heap_bytes.fetch_add(q1_bytes, Ordering::Relaxed);
state.deadlock_state.record_q1_push();
}
Err((serial, held)) => {
worker.held_raw = Some((serial, held));
return false;
}
}
}
if state.read_done.load(Ordering::Relaxed) {
return false;
}
if state.q1_raw_blocks.len() >= state.config.queue_capacity {
return false;
}
let Some(mut guard) = state.input_file.try_lock() else {
if let Some(stats) = state.stats() {
stats.record_contention(PipelineStep::Read);
}
        return false;
    };
let Some(ref mut reader) = *guard else {
        return false;
    };
match read_raw_blocks(reader.as_mut(), state.config.blocks_per_read_batch) {
Ok(blocks) if blocks.is_empty() => {
state.read_done.store(true, Ordering::SeqCst);
false
}
Ok(blocks) => {
let serial = state.next_read_serial.fetch_add(1, Ordering::SeqCst);
let batch = RawBlockBatch { blocks };
if let Some(stats) = state.stats() {
stats.bytes_read.fetch_add(batch.total_compressed_size() as u64, Ordering::Relaxed);
}
#[cfg(feature = "memory-debug")]
let q1_bytes = batch.total_compressed_size() as u64;
match state.q1_raw_blocks.push((serial, batch)) {
Ok(()) => {
#[cfg(feature = "memory-debug")]
state.q1_heap_bytes.fetch_add(q1_bytes, Ordering::Relaxed);
state.deadlock_state.record_q1_push();
true
}
Err((serial, batch)) => {
worker.held_raw = Some((serial, batch));
false
}
}
}
Err(e) => {
state.set_error(e);
false
}
}
}
fn try_step_decompress<G: Send, P: Send + MemoryEstimate>(
state: &BamPipelineState<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
let mut advanced_held = false;
if let Some((serial, held, heap_size)) = worker.held_decompressed.take() {
state.q2_reorder_state.add_heap_bytes(heap_size as u64);
match state.q2_decompressed.push((serial, held)) {
Ok(()) => {
state.batches_decompressed.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q2_push();
advanced_held = true;
}
Err((serial, held)) => {
state.q2_reorder_state.sub_heap_bytes(heap_size as u64);
worker.held_decompressed = Some((serial, held, heap_size));
return false;
}
}
}
if state.q2_decompressed.is_full() || state.q2_reorder_state.is_memory_high() {
return advanced_held;
}
let Some((serial, raw_batch)) = state.q1_raw_blocks.pop() else {
if let Some(stats) = state.stats() {
stats.record_queue_empty(1);
}
return advanced_held;
};
#[cfg(feature = "memory-debug")]
{
let q1_pop_bytes = raw_batch.total_compressed_size() as u64;
state.q1_heap_bytes.fetch_sub(q1_pop_bytes, Ordering::Relaxed);
}
state.deadlock_state.record_q1_pop();
worker.decompression_buffer.clear();
let expected_size = raw_batch.total_uncompressed_size();
worker.decompression_buffer.reserve(expected_size);
for block in &raw_batch.blocks {
if let Err(e) =
decompress_block_into(block, &mut worker.decompressor, &mut worker.decompression_buffer)
{
state.set_error(e);
return false;
}
}
let decompressed = std::mem::replace(
&mut worker.decompression_buffer,
Vec::with_capacity(DECOMPRESSION_BUFFER_CAPACITY),
);
if let Some(stats) = state.stats() {
stats
.compressed_bytes_in
.fetch_add(raw_batch.total_compressed_size() as u64, Ordering::Relaxed);
stats.decompressed_bytes.fetch_add(decompressed.len() as u64, Ordering::Relaxed);
}
let batch = DecompressedBatch { data: decompressed };
let heap_size = batch.estimate_heap_size();
state.q2_reorder_state.add_heap_bytes(heap_size as u64);
match state.q2_decompressed.push((serial, batch)) {
Ok(()) => {
state.batches_decompressed.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q2_push();
true
}
Err((serial, batch)) => {
state.q2_reorder_state.sub_heap_bytes(heap_size as u64);
worker.held_decompressed = Some((serial, batch, heap_size));
false
}
}
}
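// FindBoundaries must see the decompressed stream in read order: q2 batches
// are re-sequenced through `q2_reorder`, then scanned under the shared
// `boundary_state` lock, up to `MAX_BATCHES_PER_LOCK` batches per acquisition.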
#[allow(clippy::too_many_lines)]
fn try_step_find_boundaries<G: Send, P: Send + MemoryEstimate>(
state: &BamPipelineState<G, P>,
worker: &mut WorkerState<P>,
) -> (bool, bool) {
const MAX_BATCHES_PER_LOCK: usize = 8;
let mut did_work = false;
if let Some((serial, held)) = worker.held_boundaries.take() {
match state.q2b_boundaries.push((serial, held)) {
Ok(()) => {
state.batches_boundary_found.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q2b_push();
did_work = true;
}
Err((serial, held)) => {
worker.held_boundaries = Some((serial, held));
                return (false, false);
            }
}
}
if state.q2b_boundaries.is_full() {
        return (false, false);
    }
let Some(mut boundary_guard) = state.boundary_state.try_lock() else {
if let Some(stats) = state.stats() {
stats.record_contention(PipelineStep::FindBoundaries);
}
        return (did_work, true);
    };
for _ in 0..MAX_BATCHES_PER_LOCK {
if state.q2b_boundaries.is_full() {
break;
}
let batch_with_size = {
let mut reorder = state.q2_reorder.lock();
while let Some((serial, batch)) = state.q2_decompressed.pop() {
state.deadlock_state.record_q2_pop();
let heap_size = batch.estimate_heap_size();
reorder.insert_with_size(serial, batch, heap_size);
}
let result = reorder.try_pop_next_with_size();
state.q2_reorder_state.update_next_seq(reorder.next_seq());
result
};
let Some((batch, heap_size)) = batch_with_size else {
if !did_work {
if let Some(stats) = state.stats() {
stats.record_queue_empty(2);
}
}
            break;
        };
state.q2_reorder_state.sub_heap_bytes(heap_size as u64);
state.batches_boundary_processed.fetch_add(1, Ordering::Release);
match boundary_guard.find_boundaries(&batch.data) {
Ok(boundary_batch) => {
if boundary_batch.offsets.len() > 1 {
let num_records = boundary_batch.offsets.len() - 1;
if let Some(stats) = state.stats() {
stats.record_batch_size(num_records);
}
let serial = state.next_boundary_serial.fetch_add(1, Ordering::SeqCst);
match state.q2b_boundaries.push((serial, boundary_batch)) {
Ok(()) => {
state.batches_boundary_found.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q2b_push();
}
Err((serial, boundary_batch)) => {
worker.held_boundaries = Some((serial, boundary_batch));
                            return (true, false);
                        }
}
}
did_work = true;
}
Err(e) => {
state.set_error(e);
return (false, false);
}
}
}
if did_work {
        return (true, false);
    }
let read_done = state.read_done.load(Ordering::Acquire);
let total_read = state.next_read_serial.load(Ordering::Acquire);
let batches_boundary_processed = state.batches_boundary_processed.load(Ordering::Acquire);
if read_done
&& batches_boundary_processed == total_read
&& !state.boundary_done.load(Ordering::Acquire)
{
match boundary_guard.finish() {
Ok(Some(final_batch)) => {
if final_batch.offsets.len() > 1 {
let serial = state.next_boundary_serial.fetch_add(1, Ordering::SeqCst);
match state.q2b_boundaries.push((serial, final_batch)) {
Ok(()) => {
state.batches_boundary_found.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q2b_push();
}
Err((serial, final_batch)) => {
worker.held_boundaries = Some((serial, final_batch));
return (true, false);
}
}
}
state.boundary_done.store(true, Ordering::SeqCst);
(true, false)
}
Ok(None) => {
state.boundary_done.store(true, Ordering::SeqCst);
(false, false)
}
Err(e) => {
state.set_error(e);
(false, false)
}
}
} else {
        (false, false)
    }
}
fn try_step_decode<G: Send, P: Send + MemoryEstimate>(
state: &BamPipelineState<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
let mut advanced_held = false;
if let Some((serial, held, heap_size)) = worker.held_decoded.take() {
state.q3_reorder_state.add_heap_bytes(heap_size as u64);
match state.q3_decoded.push((serial, held)) {
Ok(()) => {
state.batches_decoded.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q3_push();
advanced_held = true;
}
Err((serial, held)) => {
state.q3_reorder_state.sub_heap_bytes(heap_size as u64);
worker.held_decoded = Some((serial, held, heap_size));
return false;
}
}
}
if state.q3_decoded.is_full() || state.q3_reorder_state.is_memory_high() {
return advanced_held;
}
let Some((serial, boundary_batch)) = state.q2b_boundaries.pop() else {
if let Some(stats) = state.stats() {
            stats.record_queue_empty(25);
        }
if state.boundary_done.load(Ordering::SeqCst) && state.q2b_boundaries.is_empty() {
state.decode_done.store(true, Ordering::SeqCst);
} else if let Some(stats) = state.stats() {
stats.record_queue_empty(2);
}
return advanced_held;
};
state.deadlock_state.record_q2b_pop();
match decode_records(&boundary_batch, &state.group_key_config) {
Ok(records) => {
if let Some(stats) = state.stats() {
stats.records_decoded.fetch_add(records.len() as u64, Ordering::Relaxed);
}
let heap_size = records.estimate_heap_size();
state.q3_reorder_state.add_heap_bytes(heap_size as u64);
match state.q3_decoded.push((serial, records)) {
Ok(()) => {
state.batches_decoded.fetch_add(1, Ordering::Release);
state.deadlock_state.record_q3_push();
true
}
Err((serial, records)) => {
state.q3_reorder_state.sub_heap_bytes(heap_size as u64);
worker.held_decoded = Some((serial, records, heap_size));
false
}
}
}
Err(e) => {
state.set_error(e);
false
}
}
}
#[allow(clippy::too_many_lines)]
fn try_step_group<G: Send + BatchWeight + MemoryEstimate + 'static, P: Send + MemoryEstimate>(
state: &BamPipelineState<G, P>,
group_state: &Mutex<GroupState<G>>,
) -> (bool, bool) {
const MAX_BATCHES_PER_LOCK: usize = 8;
const MAX_PENDING_DRAIN: usize = 16;
let Some(mut guard) = group_state.try_lock() else {
if let Some(stats) = state.stats() {
stats.record_contention(PipelineStep::Group);
}
        return (false, true);
    };
let batch_size = state.config.batch_size;
let target_weight = state.config.target_templates_per_batch;
let use_weight_batching = target_weight > 0;
let should_flush = |pending_len: usize, pending_weight: usize| -> bool {
if use_weight_batching {
pending_weight >= target_weight
} else {
pending_len >= batch_size
}
};
let push_batch = |groups: Vec<G>, state: &BamPipelineState<G, P>| -> Result<(), Vec<G>> {
if state.output.groups.is_full() {
return Err(groups);
}
#[cfg(feature = "memory-debug")]
{
let heap_size: u64 = groups.iter().map(|g| g.estimate_heap_size() as u64).sum();
state.output.groups_heap_bytes.fetch_add(heap_size, Ordering::AcqRel);
}
let serial = state.next_group_serial.fetch_add(1, Ordering::SeqCst);
state
.output
.groups
.push((serial, groups))
.unwrap_or_else(|_| panic!("groups push failed after is_full check"));
state.deadlock_state.record_q4_push();
Ok(())
};
let flush_all = |guard: &mut GroupState<G>, state: &BamPipelineState<G, P>| -> Option<bool> {
if guard.pending_groups.is_empty() {
return Some(true);
}
if state.output.groups.is_full() {
            return None;
        }
let batch: Vec<G> = guard.pending_groups.drain(..).collect();
guard.pending_weight = 0;
match push_batch(batch, state) {
Ok(()) => Some(true),
Err(batch) => {
for group in batch.into_iter().rev() {
guard.pending_weight += group.batch_weight();
guard.pending_groups.push_front(group);
}
None
}
}
};
    // Drain `pending_groups` into q4 in batch-sized chunks. Returns `false`
    // if q4 filled up (or a push failed) before everything eligible was
    // flushed; un-pushed groups are restored to the front of the deque.
    let flush_pending = |guard: &mut GroupState<G>| -> bool {
        while should_flush(guard.pending_groups.len(), guard.pending_weight) {
            if state.output.groups.is_full() {
                return false;
            }
            let batch: Vec<G> = if use_weight_batching {
                guard.pending_weight = 0;
                guard.pending_groups.drain(..).collect()
            } else {
                guard.pending_groups.drain(..batch_size).collect()
            };
            if let Err(batch) = push_batch(batch, state) {
                for group in batch.into_iter().rev() {
                    guard.pending_weight += group.batch_weight();
                    guard.pending_groups.push_front(group);
                }
                return false;
            }
            if use_weight_batching {
                break;
            }
        }
        true
    };
    if !flush_pending(&mut guard) {
        return (false, false);
    }
if guard.is_finished() && !state.group_done.load(Ordering::SeqCst) {
if flush_all(&mut guard, state).is_some() {
state.group_done.store(true, Ordering::SeqCst);
return (true, false);
}
        return (false, false);
    }
let mut did_work = false;
let mut pending: Vec<(u64, Vec<DecodedRecord>, usize)> = Vec::with_capacity(MAX_PENDING_DRAIN);
for _ in 0..MAX_BATCHES_PER_LOCK {
pending.clear();
while pending.len() < MAX_PENDING_DRAIN {
if let Some((serial, batch)) = state.q3_decoded.pop() {
state.deadlock_state.record_q3_pop();
let heap_size = batch.estimate_heap_size();
pending.push((serial, batch, heap_size));
} else {
break;
}
}
let records = {
let mut reorder = state.q3_reorder.lock();
for (serial, batch, heap_size) in pending.drain(..) {
reorder.insert_with_size(serial, batch, heap_size);
}
let result = reorder.try_pop_next_with_size();
state.q3_reorder_state.update_next_seq(reorder.next_seq());
state.q3_reorder_can_pop.store(reorder.can_pop(), Ordering::Release);
result
};
let Some((records, heap_size)) = records else {
if !did_work {
if let Some(stats) = state.stats() {
stats.record_queue_empty(3);
}
}
            break;
        };
state.q3_reorder_state.sub_heap_bytes(heap_size as u64);
state.batches_grouped.fetch_add(1, Ordering::Release);
match guard.process(records) {
Ok(groups) => {
if let Some(stats) = state.stats() {
stats.groups_produced.fetch_add(groups.len() as u64, Ordering::Relaxed);
}
for group in groups {
guard.pending_weight += group.batch_weight();
guard.pending_groups.push_back(group);
}
                if !flush_pending(&mut guard) {
                    return (true, false);
                }
did_work = true;
}
Err(e) => {
state.set_error(e);
return (false, false);
}
}
}
if did_work {
        return (true, false);
    }
if guard.is_finished() {
        return (false, false);
    }
let boundary_done = state.boundary_done.load(Ordering::Acquire);
let total_boundary_batches = state.batches_boundary_found.load(Ordering::Acquire);
let batches_grouped = state.batches_grouped.load(Ordering::Acquire);
if boundary_done && batches_grouped == total_boundary_batches {
match guard.finish() {
Ok(Some(group)) => {
guard.pending_weight += group.batch_weight();
guard.pending_groups.push_back(group);
                if !flush_pending(&mut guard) {
                    return (true, false);
                }
if flush_all(&mut guard, state).is_some() {
state.group_done.store(true, Ordering::SeqCst);
}
                (true, false)
            }
Ok(None) => {
if flush_all(&mut guard, state).is_some() {
state.group_done.store(true, Ordering::SeqCst);
}
                (false, false)
            }
Err(e) => {
state.set_error(e);
(false, false)
}
}
} else {
        (false, false)
    }
}
fn try_step_process<G: Send + MemoryEstimate + 'static, P: Send + MemoryEstimate + 'static>(
state: &BamPipelineState<G, P>,
fns: &PipelineFunctions<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
const MAX_BATCHES: usize = 8;
if let Some((serial, held, heap_size)) = worker.held_processed.take() {
match state.output.processed.push((serial, held)) {
Ok(()) => {
state.output.processed_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
state.deadlock_state.record_q5_push();
}
Err((serial, held)) => {
worker.held_processed = Some((serial, held, heap_size));
return false;
}
}
}
if state.output.processed.is_full() || state.is_q5_memory_high() {
return false;
}
let mut did_work = false;
for _ in 0..MAX_BATCHES {
if state.output.processed.is_full() || state.is_q5_memory_high() {
break;
}
let Some((serial, batch)) = state.output.groups.pop() else {
if let Some(stats) = state.stats() {
stats.record_queue_empty(4);
}
break;
};
state.deadlock_state.record_q4_pop();
#[cfg(feature = "memory-debug")]
{
let q4_heap: u64 = batch.iter().map(|g| g.estimate_heap_size() as u64).sum();
state.output.groups_heap_bytes.fetch_sub(q4_heap, Ordering::AcqRel);
}
let mut results: Vec<P> = Vec::with_capacity(batch.len());
for group in batch {
match (fns.process_fn)(group) {
Ok(processed) => results.push(processed),
Err(e) => {
state.set_error(e);
return false;
}
}
}
let heap_size: usize = results.iter().map(MemoryEstimate::estimate_heap_size).sum();
match state.output.processed.push((serial, results)) {
Ok(()) => {
state.output.processed_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
state.deadlock_state.record_q5_push();
did_work = true;
}
Err((serial, results)) => {
worker.held_processed = Some((serial, results, heap_size));
break;
}
}
}
did_work
}
fn try_step_serialize<G: Send + 'static, P: Send + MemoryEstimate + 'static>(
state: &BamPipelineState<G, P>,
fns: &PipelineFunctions<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
if let Some((serial, held, heap_size)) = worker.held_serialized.take() {
match state.output.serialized.push((serial, held)) {
Ok(()) => {
state.output.serialized_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
state.deadlock_state.record_q6_push();
}
Err((serial, held)) => {
worker.held_serialized = Some((serial, held, heap_size));
return false;
}
}
}
if state.output.serialized.is_full() {
return false;
}
let Some((serial, batch)) = state.output.processed.pop() else {
if let Some(stats) = state.stats() {
stats.record_queue_empty(5);
}
return false;
};
state.deadlock_state.record_q5_pop();
let q5_heap_size: usize = batch.iter().map(MemoryEstimate::estimate_heap_size).sum();
state.output.processed_heap_bytes.fetch_sub(q5_heap_size as u64, Ordering::AcqRel);
worker.core.serialization_buffer.clear();
worker.core.secondary_serialization_buffer.clear();
let mut total_record_count: u64 = 0;
for item in batch {
if let Some(ref secondary_fn) = fns.secondary_serialize_fn {
if let Err(e) = (secondary_fn)(&item, &mut worker.core.secondary_serialization_buffer) {
state.set_error(e);
return false;
}
}
match (fns.serialize_fn)(item, &mut worker.core.serialization_buffer) {
Ok(record_count) => {
total_record_count += record_count;
}
Err(e) => {
state.set_error(e);
return false;
}
}
}
let combined_data = std::mem::replace(
&mut worker.core.serialization_buffer,
Vec::with_capacity(SERIALIZATION_BUFFER_CAPACITY),
);
let secondary_data = if worker.core.secondary_serialization_buffer.is_empty() {
None
} else {
Some(std::mem::replace(
&mut worker.core.secondary_serialization_buffer,
Vec::with_capacity(SERIALIZATION_BUFFER_CAPACITY),
))
};
if let Some(stats) = state.stats() {
stats.serialized_bytes.fetch_add(combined_data.len() as u64, Ordering::Relaxed);
}
let batch =
SerializedBatch { data: combined_data, record_count: total_record_count, secondary_data };
let heap_size = batch.estimate_heap_size();
match state.output.serialized.push((serial, batch)) {
Ok(()) => {
state.output.serialized_heap_bytes.fetch_add(heap_size as u64, Ordering::AcqRel);
state.deadlock_state.record_q6_push();
true
}
Err((serial, batch)) => {
worker.held_serialized = Some((serial, batch, heap_size));
false
}
}
}
fn try_step_compress<G: Send + 'static, P: Send + MemoryEstimate + 'static>(
state: &BamPipelineState<G, P>,
worker: &mut WorkerState<P>,
) -> bool {
shared_try_step_compress(state, worker).is_success()
}
fn try_step_write<G: Send + 'static, P: Send + MemoryEstimate + 'static>(
state: &BamPipelineState<G, P>,
) -> (bool, bool) {
let Some(mut guard) = state.output.output.try_lock() else {
if let Some(stats) = state.stats() {
stats.record_contention(PipelineStep::Write);
}
        return (false, true);
    };
let Some(ref mut writer) = *guard else {
        return (false, false);
    };
let mut wrote_any = false;
let q7_truly_empty;
{
let mut reorder = state.output.write_reorder.lock();
while let Some((serial, batch)) = state.output.compressed.pop() {
            let heap_size = batch.estimate_heap_size();
            state.q6_track_pop(heap_size as u64);
            state.deadlock_state.record_q7_pop();
            reorder.insert_with_size(serial, batch, heap_size);
state.output.write_reorder_state.add_heap_bytes(heap_size as u64);
}
while let Some((batch, heap_size)) = reorder.try_pop_next_with_size() {
let mut batch_bytes: u64 = 0;
for block in &batch.blocks {
if let Err(e) = writer.write_all(&block.data) {
state.set_error(e);
                        return (false, false);
                    }
batch_bytes += block.data.len() as u64;
}
if let Some(ref secondary_data) = batch.secondary_data {
if !secondary_data.is_empty() {
if let Some(ref secondary_mutex) = state.output.secondary_output {
let mut sw_guard = secondary_mutex.lock();
if let Some(ref mut sw) = *sw_guard {
if let Err(e) = sw.write_raw_bytes(secondary_data) {
state.set_error(e);
return (false, false);
}
}
}
}
}
state.output.write_reorder_state.sub_heap_bytes(heap_size as u64);
state.output.write_reorder_state.update_next_seq(reorder.next_seq());
if let Some(stats) = state.stats() {
stats.bytes_written.fetch_add(batch_bytes, Ordering::Relaxed);
}
let records_in_batch = batch.record_count;
state.output.items_written.fetch_add(records_in_batch, Ordering::Relaxed);
state.output.progress.log_if_needed(records_in_batch);
wrote_any = true;
}
q7_truly_empty = reorder.is_empty();
}
if !wrote_any && q7_truly_empty {
if let Some(stats) = state.stats() {
stats.record_queue_empty(7);
}
}
    (wrote_any, false)
}
pub struct BamStepContext<'a, G: Send, P: Send + MemoryEstimate> {
pub state: &'a BamPipelineState<G, P>,
pub group_state: &'a Mutex<GroupState<G>>,
pub fns: &'a PipelineFunctions<G, P>,
pub is_reader: bool,
}
impl<G, P> StepContext for BamStepContext<'_, G, P>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
{
type Worker = WorkerState<P>;
fn execute_step(&self, worker: &mut Self::Worker, step: PipelineStep) -> (bool, bool) {
execute_step(self.state, self.group_state, self.fns, worker, step)
}
fn get_backpressure(&self, _worker: &Self::Worker) -> BackpressureState {
let depths = self.state.queue_depths();
let read_done = self.state.read_done.load(Ordering::Relaxed);
BackpressureState {
output_high: depths.q7 > self.state.config.output_high_water,
input_low: depths.q1 < self.state.config.input_low_water,
read_done,
memory_high: !self.state.is_draining() && self.state.is_memory_high(),
memory_drained: self.state.is_memory_drained(),
}
}
fn check_drain_mode(&self) {
let read_done = self.state.read_done.load(Ordering::Relaxed);
if read_done && self.state.q1_raw_blocks.is_empty() {
self.state.output.draining.store(true, Ordering::Relaxed);
}
}
fn has_error(&self) -> bool {
self.state.has_error()
}
fn is_complete(&self) -> bool {
self.state.is_complete()
}
fn stats(&self) -> Option<&PipelineStats> {
self.state.stats()
}
fn skip_read(&self) -> bool {
true
}
fn check_completion_at_end(&self) -> bool {
        true
    }
fn should_attempt_sticky_read(&self) -> bool {
self.is_reader && !self.state.read_done.load(Ordering::Relaxed)
}
fn sticky_read_should_continue(&self) -> bool {
!self.state.has_error()
&& !self.state.read_done.load(Ordering::Relaxed)
&& self.state.q1_raw_blocks.len() < self.state.config.queue_capacity
}
fn execute_read_step(&self, worker: &mut Self::Worker) -> bool {
try_step_read(self.state, worker)
}
fn is_drain_mode(&self) -> bool {
let read_done = self.state.read_done.load(Ordering::Relaxed);
let group_done = self.state.group_done.load(Ordering::Relaxed);
read_done && group_done
}
fn should_attempt_step(
&self,
worker: &Self::Worker,
step: PipelineStep,
drain_mode: bool,
) -> bool {
worker.core.scheduler.should_attempt_step_with_drain(step, drain_mode)
}
fn exclusive_step_owned(&self, worker: &Self::Worker) -> Option<PipelineStep> {
if self.is_reader {
None
} else {
worker.core.scheduler.exclusive_step_owned()
}
}
}
fn execute_step<
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
>(
state: &BamPipelineState<G, P>,
group_state: &Mutex<GroupState<G>>,
fns: &PipelineFunctions<G, P>,
worker: &mut WorkerState<P>,
step: PipelineStep,
) -> (bool, bool) {
match step {
        PipelineStep::Read => (false, false),
        PipelineStep::Decompress => (try_step_decompress(state, worker), false),
PipelineStep::FindBoundaries => try_step_find_boundaries(state, worker),
PipelineStep::Decode => (try_step_decode(state, worker), false),
PipelineStep::Group => try_step_group(state, group_state),
PipelineStep::Process => (try_step_process(state, fns, worker), false),
PipelineStep::Serialize => (try_step_serialize(state, fns, worker), false),
PipelineStep::Compress => (try_step_compress(state, worker), false),
PipelineStep::Write => try_step_write(state),
}
}
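/// Scratch buffers reused across iterations of the single-threaded fallback
/// below, standing in for the inter-stage queues of the threaded pipeline.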
struct SingleThreadedBuffers {
decompressed: Vec<u8>,
serialized: Vec<u8>,
secondary: Vec<u8>,
}
impl SingleThreadedBuffers {
fn new() -> Self {
Self {
decompressed: Vec::with_capacity(256 * 1024),
serialized: Vec::with_capacity(64 * 1024),
secondary: Vec::new(),
}
}
}
#[allow(clippy::needless_pass_by_value)]
fn run_bam_pipeline_single_threaded<G, P>(
config: &PipelineConfig,
mut input: Box<dyn Read + Send>,
mut output: Box<dyn Write + Send>,
mut grouper: Box<dyn Grouper<Group = G> + Send>,
fns: PipelineFunctions<G, P>,
group_key_config: GroupKeyConfig,
mut secondary_writer: Option<crate::bam_io::RawBamWriter>,
) -> io::Result<u64>
where
G: Send + 'static,
P: Send + MemoryEstimate + 'static,
{
let mut decompressor = libdeflater::Decompressor::new();
let mut boundary_state = if config.header_already_read {
BoundaryState::new_no_header()
} else {
BoundaryState::new()
};
let mut compressor = InlineBgzfCompressor::new(config.compression_level);
let mut buffers = SingleThreadedBuffers::new();
let progress = ProgressTracker::new("Processed records").with_interval(PROGRESS_LOG_INTERVAL);
loop {
let blocks = read_raw_blocks(input.as_mut(), 4)?;
if blocks.is_empty() {
break;
}
buffers.decompressed.clear();
let expected_size: usize =
blocks.iter().map(super::super::bgzf_reader::RawBgzfBlock::uncompressed_size).sum();
buffers.decompressed.reserve(expected_size);
for block in &blocks {
decompress_block_into(block, &mut decompressor, &mut buffers.decompressed)?;
}
let boundary_batch = boundary_state.find_boundaries(&buffers.decompressed)?;
if boundary_batch.offsets.len() > 1 {
let decoded = decode_records(&boundary_batch, &group_key_config)?;
let groups = grouper.add_records(decoded)?;
for group in groups {
let processed = (fns.process_fn)(group)?;
buffers.secondary.clear();
if let Some(ref secondary_fn) = fns.secondary_serialize_fn {
(secondary_fn)(&processed, &mut buffers.secondary)?;
}
buffers.serialized.clear();
let record_count = (fns.serialize_fn)(processed, &mut buffers.serialized)?;
compressor.write_all(&buffers.serialized)?;
compressor.maybe_compress()?;
compressor.write_blocks_to(output.as_mut())?;
if !buffers.secondary.is_empty() {
if let Some(ref mut sw) = secondary_writer {
sw.write_raw_bytes(&buffers.secondary)?;
}
}
progress.log_if_needed(record_count);
}
}
}
if let Some(final_batch) = boundary_state.finish()? {
if final_batch.offsets.len() > 1 {
let decoded = decode_records(&final_batch, &group_key_config)?;
let groups = grouper.add_records(decoded)?;
for group in groups {
let processed = (fns.process_fn)(group)?;
buffers.secondary.clear();
if let Some(ref secondary_fn) = fns.secondary_serialize_fn {
(secondary_fn)(&processed, &mut buffers.secondary)?;
}
buffers.serialized.clear();
let record_count = (fns.serialize_fn)(processed, &mut buffers.serialized)?;
compressor.write_all(&buffers.serialized)?;
compressor.maybe_compress()?;
compressor.write_blocks_to(output.as_mut())?;
if !buffers.secondary.is_empty() {
if let Some(ref mut sw) = secondary_writer {
sw.write_raw_bytes(&buffers.secondary)?;
}
}
progress.log_if_needed(record_count);
}
}
}
if let Some(final_group) = grouper.finish()? {
let processed = (fns.process_fn)(final_group)?;
buffers.secondary.clear();
if let Some(ref secondary_fn) = fns.secondary_serialize_fn {
(secondary_fn)(&processed, &mut buffers.secondary)?;
}
buffers.serialized.clear();
let record_count = (fns.serialize_fn)(processed, &mut buffers.serialized)?;
compressor.write_all(&buffers.serialized)?;
compressor.maybe_compress()?;
compressor.write_blocks_to(output.as_mut())?;
if !buffers.secondary.is_empty() {
if let Some(ref mut sw) = secondary_writer {
sw.write_raw_bytes(&buffers.secondary)?;
}
}
progress.log_if_needed(record_count);
}
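// Flush any remaining buffered output, then append the standard 28-byte
// BGZF EOF marker so downstream readers recognize a complete file.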
compressor.flush()?;
compressor.write_blocks_to(output.as_mut())?;
output.flush()?;
output.write_all(&BGZF_EOF)?;
output.flush()?;
if let Some(writer) = secondary_writer {
writer.finish().map_err(|e| {
io::Error::new(e.kind(), format!("Failed to finalize secondary output: {e}"))
})?;
}
Ok(progress.count())
}
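/// Entry point for the multi-stage BAM pipeline.
///
/// Falls back to the single-threaded path for one thread; otherwise spawns
/// `num_threads` workers (thread 0 acts as the dedicated reader), plus an
/// optional monitor thread for stats sampling and deadlock detection.
/// Returns the total record count on success.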
#[allow(clippy::too_many_lines, clippy::cast_possible_truncation)]
pub fn run_bam_pipeline<G, P>(
config: PipelineConfig,
input: Box<dyn Read + Send>,
output: Box<dyn Write + Send>,
grouper: Box<dyn Grouper<Group = G> + Send>,
fns: PipelineFunctions<G, P>,
group_key_config: GroupKeyConfig,
secondary_writer: Option<crate::bam_io::RawBamWriter>,
) -> io::Result<u64>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
{
let num_threads = config.num_threads;
let compression_level = config.compression_level;
let scheduler_strategy = config.scheduler_strategy;
if num_threads == 1 {
return run_bam_pipeline_single_threaded(
&config,
input,
output,
grouper,
fns,
group_key_config,
secondary_writer,
);
}
let mut state = BamPipelineState::<G, P>::new(config, input, output, group_key_config);
if let Some(sw) = secondary_writer {
state.output.set_secondary_output(sw);
}
let state = Arc::new(state);
if let Some(stats) = state.stats() {
stats.set_num_threads(num_threads);
#[cfg(feature = "memory-debug")]
stats.set_infrastructure_memory(num_threads, state.config.queue_capacity);
}
let group_state = Arc::new(Mutex::new(GroupState::new(grouper)));
let fns = Arc::new(fns);
let handles: Vec<_> = (0..num_threads)
.map(|thread_id| {
let state = Arc::clone(&state);
let group_state = Arc::clone(&group_state);
let fns = Arc::clone(&fns);
thread::spawn(move || {
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
let mut worker = WorkerState::new(
compression_level,
thread_id,
num_threads,
scheduler_strategy,
);
let ctx = BamStepContext {
state: &state,
group_state: &group_state,
fns: &fns,
is_reader: thread_id == 0,
};
generic_worker_loop(&ctx, &mut worker);
}));
if let Err(panic_info) = result {
handle_worker_panic(&*state, thread_id, panic_info);
}
})
})
.collect();
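// Monitor thread: every 100 ms, sample queue depths, reorder-buffer sizes,
// and per-queue memory for stats; roughly once per second, run the
// deadlock check when deadlock handling is enabled.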
let monitor_handle = if state.stats().is_some() || state.deadlock_state.is_enabled() {
let state_clone = Arc::clone(&state);
Some(thread::spawn(move || {
let start_time = Instant::now();
let mut deadlock_check_counter = 0u32;
loop {
thread::sleep(Duration::from_millis(100));
if state_clone.is_complete() || state_clone.has_error() {
break;
}
let queue_sizes = [
state_clone.q1_raw_blocks.len(),
state_clone.q2_decompressed.len(),
state_clone.q2b_boundaries.len(),
state_clone.q3_decoded.len(),
state_clone.output.groups.len(),
state_clone.output.processed.len(),
state_clone.output.serialized.len(),
state_clone.output.compressed.len(),
];
let (q2_reorder_len, q2_reorder_mem) = {
let reorder = state_clone.q2_reorder.lock();
(reorder.len(), reorder.total_heap_size() as u64)
};
let (q3_reorder_len, q3_reorder_mem) = {
let reorder = state_clone.q3_reorder.lock();
(reorder.len(), reorder.total_heap_size() as u64)
};
let (q7_reorder_len, q7_reorder_mem) = {
let reorder = state_clone.output.write_reorder.lock();
(reorder.len(), reorder.total_heap_size() as u64)
};
let reorder_sizes = [q2_reorder_len, q3_reorder_len, q7_reorder_len];
let reorder_memory_bytes = [q2_reorder_mem, q3_reorder_mem, q7_reorder_mem];
#[cfg(feature = "memory-debug")]
let q1_mem = state_clone.q1_heap_bytes.load(Ordering::Relaxed);
#[cfg(not(feature = "memory-debug"))]
let q1_mem: u64 = 0;
let q2_mem = state_clone.q2_reorder_state.heap_bytes.load(Ordering::Relaxed);
let q3_mem = state_clone.q3_reorder_state.heap_bytes.load(Ordering::Relaxed);
let q4_mem = state_clone.output.groups_heap_bytes.load(Ordering::Relaxed);
let q5_mem = state_clone.output.processed_heap_bytes.load(Ordering::Relaxed);
let q6_mem = state_clone.output.serialized_heap_bytes.load(Ordering::Relaxed);
let q7_mem = state_clone.output.compressed_heap_bytes.load(Ordering::Relaxed);
let queue_memory_bytes =
[q1_mem, q2_mem, 0, q3_mem, q4_mem, q5_mem, q6_mem, q7_mem];
let thread_steps: Vec<u8> = if let Some(stats) = state_clone.stats() {
let num_threads = stats.num_threads.load(Ordering::Relaxed) as usize;
(0..num_threads)
.map(|tid| stats.per_thread_current_step[tid].load(Ordering::Relaxed))
.collect()
} else {
Vec::new()
};
if let Some(stats) = state_clone.stats() {
let total_mem = q1_mem
+ q2_mem
+ q3_mem
+ q7_reorder_mem
+ q4_mem
+ q5_mem
+ q6_mem
+ q7_mem;
stats.record_memory_usage(total_mem);
#[cfg(feature = "memory-debug")]
stats.update_queue_memory_from_external(&[
("q1", q1_mem),
("q2", q2_mem),
("q3", q3_mem),
("q4", q4_mem),
("q5", q5_mem),
("q6", q6_mem),
("q7", q7_mem),
]);
stats.add_queue_sample(QueueSample {
time_ms: start_time.elapsed().as_millis() as u64,
queue_sizes,
reorder_sizes,
queue_memory_bytes,
reorder_memory_bytes,
thread_steps,
});
}
if state_clone.deadlock_state.is_enabled() {
deadlock_check_counter += 1;
if deadlock_check_counter >= 10 {
deadlock_check_counter = 0;
let snapshot = state_clone.build_queue_snapshot();
if let DeadlockAction::Detected =
check_deadlock_and_restore(&state_clone.deadlock_state, &snapshot)
{
state_clone.set_error(io::Error::new(
io::ErrorKind::TimedOut,
"pipeline deadlock detected with recovery disabled; \
use --deadlock-recover to enable automatic recovery",
));
break;
}
}
}
}
}))
} else {
None
};
join_worker_threads(handles)?;
join_monitor_thread(monitor_handle);
let result = finalize_pipeline(&*state);
if let Some(ref secondary_mutex) = state.output.secondary_output {
let mut guard = secondary_mutex.lock();
if let Some(writer) = guard.take() {
if let Err(e) = writer.finish().map_err(|e| {
io::Error::new(e.kind(), format!("Failed to finalize secondary output: {e}"))
}) {
if result.is_err() {
log::error!("Secondary output finalization also failed: {e}");
} else {
return Err(e);
}
}
}
}
result
}
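// Per-thread scratch buffer for encoding a single BAM record before it is
// length-prefixed into the output buffer.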
thread_local! {
static SERIALIZE_RECORD_BUFFER: std::cell::RefCell<Vec<u8>> =
std::cell::RefCell::new(Vec::with_capacity(512));
}
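/// Encodes `records` as length-prefixed BAM records (a little-endian
/// `block_size` followed by the encoded record) and appends them to
/// `output`, returning the number of records written.
///
/// The 400-byte-per-record estimate is only a capacity-reservation
/// heuristic, not a hard limit.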
#[allow(clippy::cast_possible_truncation)]
pub fn serialize_bam_records_into(
records: &[RecordBuf],
header: &Header,
output: &mut Vec<u8>,
) -> io::Result<u64> {
use crate::vendored::bam_codec::encode_record_buf;
log::trace!("serialize_bam_records_into: {} records", records.len());
let estimated_batch_size = records.len() * 400;
output.reserve(estimated_batch_size);
SERIALIZE_RECORD_BUFFER.with(|buf| {
let mut record_data = buf.borrow_mut();
for (i, record) in records.iter().enumerate() {
record_data.clear();
if let Err(e) = encode_record_buf(&mut record_data, header, record) {
log::error!(
"serialize_bam_records_into: failed to encode record {}: {:?}, name={:?}, seq_len={}, qual_len={}",
i,
e,
record.name(),
record.sequence().len(),
record.quality_scores().len(),
);
return Err(e);
}
let block_size = record_data.len() as u32;
output.extend_from_slice(&block_size.to_le_bytes());
output.extend_from_slice(&record_data);
}
Ok(records.len() as u64)
})
}
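/// Convenience wrapper around [`serialize_bam_records_into`] that allocates
/// a fresh buffer and wraps it in a [`SerializedBatch`].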
pub fn serialize_bam_records(
records: &[RecordBuf],
header: &Header,
) -> io::Result<SerializedBatch> {
let mut data = Vec::with_capacity(records.len() * 256);
let record_count = serialize_bam_records_into(records, header, &mut data)?;
Ok(SerializedBatch { data, record_count, secondary_data: None })
}
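/// Single-record counterpart of [`serialize_bam_records`].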
pub fn serialize_bam_record(record: &RecordBuf, header: &Header) -> io::Result<SerializedBatch> {
let mut data = Vec::with_capacity(256);
let record_count = serialize_bam_record_into(record, header, &mut data)?;
Ok(SerializedBatch { data, record_count, secondary_data: None })
}
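/// Encodes one record with its `block_size` prefix into `output`; returns 1
/// on success.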
#[allow(clippy::cast_possible_truncation)]
pub fn serialize_bam_record_into(
record: &RecordBuf,
header: &Header,
output: &mut Vec<u8>,
) -> io::Result<u64> {
use crate::vendored::bam_codec::encode_record_buf;
SERIALIZE_RECORD_BUFFER.with(|buf| {
let mut record_data = buf.borrow_mut();
record_data.clear();
encode_record_buf(&mut record_data, header, record)?;
let block_size = record_data.len() as u32;
output.extend_from_slice(&block_size.to_le_bytes());
output.extend_from_slice(&record_data);
Ok(1)
})
}
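/// Like [`serialize_bam_records_into`], but writes straight into the BGZF
/// compressor's internal buffer and lets it emit blocks as they fill,
/// avoiding an intermediate copy of the serialized batch.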
#[allow(clippy::cast_possible_truncation)]
pub fn serialize_bam_records_to_compressor(
records: &[RecordBuf],
header: &Header,
compressor: &mut crate::bgzf_writer::InlineBgzfCompressor,
) -> io::Result<u64> {
use crate::vendored::bam_codec::encode_record_buf;
log::trace!("serialize_bam_records_to_compressor: {} records", records.len());
SERIALIZE_RECORD_BUFFER.with(|buf| {
let mut record_data = buf.borrow_mut();
for (i, record) in records.iter().enumerate() {
record_data.clear();
if let Err(e) = encode_record_buf(&mut record_data, header, record) {
log::error!(
"serialize_bam_records_to_compressor: failed to encode record {}: {:?}, name={:?}, seq_len={}, qual_len={}",
i,
e,
record.name(),
record.sequence().len(),
record.quality_scores().len(),
);
return Err(e);
}
let block_size = record_data.len() as u32;
let buffer = compressor.buffer_mut();
buffer.extend_from_slice(&block_size.to_le_bytes());
buffer.extend_from_slice(&record_data);
compressor.maybe_compress()?;
}
Ok(records.len() as u64)
})
}
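/// BAM-specific pipeline configuration: the generic [`PipelineConfig`] plus
/// the BGZF compression level and an optional group-key configuration.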
#[derive(Debug, Clone)]
pub struct BamPipelineConfig {
pub pipeline: PipelineConfig,
pub compression_level: u32,
pub group_key_config: Option<GroupKeyConfig>,
}
impl BamPipelineConfig {
#[must_use]
pub fn new(num_threads: usize, compression_level: u32) -> Self {
Self {
pipeline: PipelineConfig::new(num_threads, compression_level),
compression_level,
group_key_config: None,
}
}
#[must_use]
pub fn auto_tuned(num_threads: usize, compression_level: u32) -> Self {
Self {
pipeline: PipelineConfig::auto_tuned(num_threads, compression_level),
compression_level,
group_key_config: None,
}
}
#[must_use]
pub fn with_compression_level(mut self, level: u32) -> Self {
self.compression_level = level;
self.pipeline.compression_level = level;
self
}
#[must_use]
pub fn with_group_key_config(mut self, config: GroupKeyConfig) -> Self {
self.group_key_config = Some(config);
self
}
}
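/// Opens the pipeline output, writing to standard output when the path is
/// the stdout sentinel.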
fn open_pipeline_output(output_path: &Path) -> io::Result<Box<dyn Write + Send>> {
if is_stdout_path(output_path) {
Ok(Box::new(std::io::stdout()))
} else {
let file = File::create(output_path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to create output: {e}")))?;
Ok(Box::new(file))
}
}
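/// High-level front end: reads the header from `input_path`, echoes it to
/// `output_path`, and runs the pipeline with that same header for both
/// grouping and serialization.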
pub fn run_bam_pipeline_with_grouper<G, P, GrouperFn, ProcessFn, SerializeFn>(
config: BamPipelineConfig,
input_path: &Path,
output_path: &Path,
grouper_fn: GrouperFn,
process_fn: ProcessFn,
serialize_fn: SerializeFn,
) -> io::Result<u64>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
GrouperFn: FnOnce(&Header) -> Box<dyn Grouper<Group = G> + Send>,
ProcessFn: Fn(G) -> io::Result<P> + Send + Sync + 'static,
SerializeFn: Fn(P, &Header, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
let header = {
let input_file = File::open(input_path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open input: {e}")))?;
let mut bam_reader = bam::io::Reader::new(input_file);
bam_reader.read_header().map_err(|e| {
io::Error::new(io::ErrorKind::InvalidData, format!("Failed to read BAM header: {e}"))
})?
};
let output_writer = open_pipeline_output(output_path)?;
let mut header_writer = bam::io::Writer::new(output_writer);
header_writer
.write_header(&header)
.map_err(|e| io::Error::other(format!("Failed to write BAM header: {e}")))?;
let mut bgzf_writer = header_writer.into_inner();
bgzf_writer
.try_finish()
.map_err(|e| io::Error::other(format!("Failed to finish BGZF header: {e}")))?;
let output = bgzf_writer.into_inner();
let input = File::open(input_path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to re-open input: {e}")))?;
let input = BufReader::with_capacity(IO_BUFFER_SIZE, input);
let output = BufWriter::with_capacity(IO_BUFFER_SIZE, output);
let group_key_config = config.group_key_config.unwrap_or_else(|| {
let library_index = LibraryIndex::from_header(&header);
let cell_tag = Tag::from(SamTag::CB);
GroupKeyConfig::new(library_index, cell_tag)
});
let grouper = grouper_fn(&header);
let header_clone = header.clone();
let fns = PipelineFunctions::new(process_fn, move |p: P, buf: &mut Vec<u8>| {
serialize_fn(p, &header_clone, buf)
});
run_bam_pipeline(
config.pipeline,
Box::new(input),
Box::new(output),
grouper,
fns,
group_key_config,
None,
)
}
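/// Variant of [`run_bam_pipeline_with_grouper`] that writes a caller-supplied
/// `output_header` while still deriving the grouper and group-key
/// configuration from the input header.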
pub fn run_bam_pipeline_with_header<G, P, GrouperFn, ProcessFn, SerializeFn>(
config: BamPipelineConfig,
input_path: &Path,
output_path: &Path,
output_header: Header,
grouper_fn: GrouperFn,
process_fn: ProcessFn,
serialize_fn: SerializeFn,
) -> io::Result<u64>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
GrouperFn: FnOnce(&Header) -> Box<dyn Grouper<Group = G> + Send>,
ProcessFn: Fn(G) -> io::Result<P> + Send + Sync + 'static,
SerializeFn: Fn(P, &Header, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
let input_header = {
let input_file = File::open(input_path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to open input: {e}")))?;
let mut bam_reader = bam::io::Reader::new(input_file);
bam_reader.read_header().map_err(|e| {
io::Error::new(io::ErrorKind::InvalidData, format!("Failed to read BAM header: {e}"))
})?
};
let output_writer = open_pipeline_output(output_path)?;
let mut header_writer = bam::io::Writer::new(output_writer);
header_writer
.write_header(&output_header)
.map_err(|e| io::Error::other(format!("Failed to write BAM header: {e}")))?;
let mut bgzf_writer = header_writer.into_inner();
bgzf_writer
.try_finish()
.map_err(|e| io::Error::other(format!("Failed to finish BGZF header: {e}")))?;
let output = bgzf_writer.into_inner();
let input = File::open(input_path)
.map_err(|e| io::Error::new(e.kind(), format!("Failed to re-open input: {e}")))?;
let input = BufReader::with_capacity(IO_BUFFER_SIZE, input);
let output = BufWriter::with_capacity(IO_BUFFER_SIZE, output);
let group_key_config = config.group_key_config.unwrap_or_else(|| {
let library_index = LibraryIndex::from_header(&input_header);
let cell_tag = Tag::from(SamTag::CB);
GroupKeyConfig::new(library_index, cell_tag)
});
let grouper = grouper_fn(&input_header);
let fns = PipelineFunctions::new(process_fn, move |p: P, buf: &mut Vec<u8>| {
serialize_fn(p, &output_header, buf)
});
run_bam_pipeline(
config.pipeline,
Box::new(input),
Box::new(output),
grouper,
fns,
group_key_config,
None,
)
}
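/// Variant that takes an already-opened reader together with its previously
/// read header; the output header defaults to a clone of the input header
/// when not supplied.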
#[allow(clippy::too_many_arguments, clippy::needless_pass_by_value)]
pub fn run_bam_pipeline_from_reader<G, P, R, GrouperFn, ProcessFn, SerializeFn>(
config: BamPipelineConfig,
input: R,
input_header: Header,
output_path: &Path,
output_header: Option<Header>,
grouper_fn: GrouperFn,
process_fn: ProcessFn,
serialize_fn: SerializeFn,
) -> io::Result<u64>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
R: Read + Send + 'static,
GrouperFn: FnOnce(&Header) -> Box<dyn Grouper<Group = G> + Send>,
ProcessFn: Fn(G) -> io::Result<P> + Send + Sync + 'static,
SerializeFn: Fn(P, &Header, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
let output_header = output_header.unwrap_or_else(|| input_header.clone());
let output_writer = open_pipeline_output(output_path)?;
let mut header_writer = bam::io::Writer::new(output_writer);
header_writer
.write_header(&output_header)
.map_err(|e| io::Error::other(format!("Failed to write BAM header: {e}")))?;
let mut bgzf_writer = header_writer.into_inner();
bgzf_writer
.try_finish()
.map_err(|e| io::Error::other(format!("Failed to finish BGZF header: {e}")))?;
let output = bgzf_writer.into_inner();
let output = BufWriter::with_capacity(IO_BUFFER_SIZE, output);
let group_key_config = config.group_key_config.unwrap_or_else(|| {
let library_index = LibraryIndex::from_header(&input_header);
let cell_tag = Tag::from(SamTag::CB);
GroupKeyConfig::new(library_index, cell_tag)
});
let grouper = grouper_fn(&input_header);
let fns = PipelineFunctions::new(process_fn, move |p: P, buf: &mut Vec<u8>| {
serialize_fn(p, &output_header, buf)
});
run_bam_pipeline(
config.pipeline,
Box::new(input),
Box::new(output),
grouper,
fns,
group_key_config,
None,
)
}
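/// Like [`run_bam_pipeline_from_reader`], but additionally routes per-group
/// secondary output (produced by `secondary_serialize_fn`) to a raw BAM
/// writer at `secondary_output_path`.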
#[allow(clippy::too_many_arguments, clippy::needless_pass_by_value)]
pub fn run_bam_pipeline_from_reader_with_secondary<
G,
P,
R,
GrouperFn,
ProcessFn,
SerializeFn,
SecondaryFn,
>(
config: BamPipelineConfig,
input: R,
input_header: Header,
output_path: &Path,
output_header: Option<Header>,
secondary_output_path: &Path,
grouper_fn: GrouperFn,
process_fn: ProcessFn,
serialize_fn: SerializeFn,
secondary_serialize_fn: SecondaryFn,
) -> io::Result<u64>
where
G: Send + BatchWeight + MemoryEstimate + 'static,
P: Send + MemoryEstimate + 'static,
R: Read + Send + 'static,
GrouperFn: FnOnce(&Header) -> Box<dyn Grouper<Group = G> + Send>,
ProcessFn: Fn(G) -> io::Result<P> + Send + Sync + 'static,
SerializeFn: Fn(P, &Header, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
SecondaryFn: Fn(&P, &mut Vec<u8>) -> io::Result<u64> + Send + Sync + 'static,
{
let output_header = output_header.unwrap_or_else(|| input_header.clone());
let output_writer = open_pipeline_output(output_path)?;
let mut header_writer = bam::io::Writer::new(output_writer);
header_writer
.write_header(&output_header)
.map_err(|e| io::Error::other(format!("Failed to write BAM header: {e}")))?;
let mut bgzf_writer = header_writer.into_inner();
bgzf_writer
.try_finish()
.map_err(|e| io::Error::other(format!("Failed to finish BGZF header: {e}")))?;
let output = bgzf_writer.into_inner();
let output = BufWriter::with_capacity(IO_BUFFER_SIZE, output);
let secondary_writer = crate::bam_io::create_raw_bam_writer(
secondary_output_path,
&output_header,
1,
config.compression_level,
)
.map_err(|e| io::Error::other(format!("Failed to create secondary output: {e}")))?;
let group_key_config = config.group_key_config.unwrap_or_else(|| {
let library_index = LibraryIndex::from_header(&input_header);
let cell_tag = Tag::from(SamTag::CB);
GroupKeyConfig::new(library_index, cell_tag)
});
let grouper = grouper_fn(&input_header);
let fns = PipelineFunctions::new(process_fn, move |p: P, buf: &mut Vec<u8>| {
serialize_fn(p, &output_header, buf)
})
.with_secondary_serialize(secondary_serialize_fn);
let pipeline_config = config.pipeline;
run_bam_pipeline(
pipeline_config,
Box::new(input),
Box::new(output),
grouper,
fns,
group_key_config,
Some(secondary_writer),
)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::read_info::LibraryIndex;
fn create_test_state(memory_limit: u64) -> BamPipelineState<(), ()> {
let config = PipelineConfig::new(2, 6).with_queue_memory_limit(memory_limit);
let input: Box<dyn Read + Send> = Box::new(std::io::empty());
let output: Box<dyn Write + Send> = Box::new(std::io::sink());
let header = Header::default();
let library_index = LibraryIndex::from_header(&header);
let group_key_config = GroupKeyConfig::new(library_index, SamTag::CB.into());
BamPipelineState::new(config, input, output, group_key_config)
}
#[test]
fn test_can_decompress_proceed_no_limit() {
let state = create_test_state(0);
assert!(state.can_decompress_proceed(0));
assert!(state.can_decompress_proceed(100));
}
#[test]
fn test_can_decompress_proceed_under_limit() {
let state = create_test_state(1024 * 1024);
state.q2_reorder_state.heap_bytes.store(100_000, Ordering::SeqCst);
assert!(state.can_decompress_proceed(5));
}
#[test]
fn test_can_decompress_proceed_over_limit_but_needed_serial() {
let state = create_test_state(1024 * 1024);
state.q2_reorder_state.heap_bytes.store(600_000, Ordering::SeqCst);
state.q2_reorder_state.next_seq.store(5, Ordering::SeqCst);
assert!(state.can_decompress_proceed(5));
assert!(!state.can_decompress_proceed(6));
assert!(!state.can_decompress_proceed(10));
}
#[test]
fn test_can_decompress_proceed_over_limit() {
let state = create_test_state(1024 * 1024);
state.q2_reorder_state.heap_bytes.store(600_000, Ordering::SeqCst);
state.q2_reorder_state.next_seq.store(0, Ordering::SeqCst);
assert!(!state.can_decompress_proceed(5));
}
#[test]
fn test_can_decode_proceed_no_limit() {
let state = create_test_state(0);
assert!(state.can_decode_proceed(0));
assert!(state.can_decode_proceed(100));
}
#[test]
fn test_can_decode_proceed_under_limit() {
let state = create_test_state(1024 * 1024);
state.q3_reorder_state.heap_bytes.store(100_000, Ordering::SeqCst);
assert!(state.can_decode_proceed(5));
}
#[test]
fn test_can_decode_proceed_over_limit_but_needed_serial() {
let state = create_test_state(1024 * 1024);
state.q3_reorder_state.heap_bytes.store(600_000, Ordering::SeqCst);
state.q3_reorder_state.next_seq.store(5, Ordering::SeqCst);
assert!(state.can_decode_proceed(5));
assert!(!state.can_decode_proceed(6));
}
#[test]
fn test_is_memory_high_threshold() {
let state = create_test_state(1024 * 1024 * 1024);
state.q3_reorder_state.heap_bytes.store(500 * 1024 * 1024, Ordering::SeqCst);
assert!(!state.is_memory_high());
state.q3_reorder_state.heap_bytes.store(512 * 1024 * 1024, Ordering::SeqCst);
assert!(state.is_memory_high());
}
#[test]
fn test_is_memory_drained_threshold() {
let state = create_test_state(1024 * 1024 * 1024);
state.q3_reorder_state.heap_bytes.store(200 * 1024 * 1024, Ordering::SeqCst);
assert!(state.is_memory_drained());
state.q3_reorder_state.heap_bytes.store(256 * 1024 * 1024, Ordering::SeqCst);
assert!(!state.is_memory_drained());
}
#[test]
fn test_validation_passes_when_complete() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
let result = state.validate_completion();
assert!(result.is_ok(), "Validation should pass: {result:?}");
}
#[test]
fn test_validation_detects_non_empty_q1() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
let batch = RawBlockBatch { blocks: vec![] };
assert!(state.q1_raw_blocks.push((0, batch)).is_ok());
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("q1_raw_blocks")));
}
#[test]
fn test_validation_detects_non_empty_q2() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
let batch = DecompressedBatch { data: vec![] };
assert!(state.q2_decompressed.push((0, batch)).is_ok());
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("q2_decompressed")));
}
#[test]
fn test_validation_detects_counter_mismatch_decompressed() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
state.next_read_serial.store(5, Ordering::SeqCst);
state.batches_decompressed.store(3, Ordering::SeqCst);
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.counter_mismatches.iter().any(|s| s.contains("batches_decompressed")));
}
#[test]
fn test_validation_detects_counter_mismatch_boundary_processed() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
state.next_read_serial.store(5, Ordering::SeqCst);
state.batches_decompressed.store(5, Ordering::SeqCst);
state.batches_boundary_processed.store(3, Ordering::SeqCst);
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.counter_mismatches.iter().any(|s| s.contains("batches_boundary_processed")));
}
#[test]
fn test_validation_detects_counter_mismatch_grouped() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
state.next_read_serial.store(5, Ordering::SeqCst);
state.batches_decompressed.store(5, Ordering::SeqCst);
state.batches_boundary_processed.store(5, Ordering::SeqCst);
state.batches_boundary_found.store(5, Ordering::SeqCst);
state.batches_decoded.store(5, Ordering::SeqCst);
state.batches_grouped.store(3, Ordering::SeqCst);
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.counter_mismatches.iter().any(|s| s.contains("batches_grouped")));
}
#[test]
fn test_validation_error_display() {
let err = PipelineValidationError {
non_empty_queues: vec!["q1 (5)".to_string(), "q2 (3)".to_string()],
counter_mismatches: vec!["batches_x (5) != batches_y (3)".to_string()],
leaked_heap_bytes: 0,
};
let display = err.to_string();
assert!(display.contains("q1"));
assert!(display.contains("q2"));
assert!(display.contains("batches_x"));
}
#[test]
fn test_validation_detects_non_empty_reorder_buffer() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
{
let mut q2_reorder = state.q2_reorder.lock();
let batch = DecompressedBatch { data: vec![] };
q2_reorder.insert(0, batch);
}
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("q2_reorder")));
}
#[test]
fn test_validation_detects_non_empty_q3_reorder() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
{
let mut q3_reorder = state.q3_reorder.lock();
let batch: Vec<DecodedRecord> = vec![];
q3_reorder.insert(0, batch);
}
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("q3_reorder")));
}
#[test]
fn test_validation_detects_non_empty_output_queue() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
let batch: Vec<()> = vec![()];
assert!(state.output.groups.push((0, batch)).is_ok());
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("q4_groups")));
}
#[test]
fn test_validation_detects_non_empty_write_reorder() {
let state = create_test_state(0);
state.read_done.store(true, Ordering::SeqCst);
state.group_done.store(true, Ordering::SeqCst);
{
let mut write_reorder = state.output.write_reorder.lock();
let batch =
CompressedBlockBatch { blocks: vec![], record_count: 0, secondary_data: None };
write_reorder.insert(0, batch);
}
let result = state.validate_completion();
assert!(result.is_err());
let err = result.unwrap_err();
assert!(err.non_empty_queues.iter().any(|s| s.contains("write_reorder")));
}
fn create_test_worker() -> WorkerState<()> {
WorkerState::new(6, 0, 2, SchedulerStrategy::default())
}
fn setup_memory_backpressure(reorder_state: &ReorderBufferState) {
reorder_state.heap_bytes.store(800, Ordering::SeqCst);
reorder_state.next_seq.store(0, Ordering::SeqCst);
}
#[test]
fn test_decompress_held_pushes_unconditionally_when_q2_has_room() {
let state = create_test_state(1024);
setup_memory_backpressure(&state.q2_reorder_state);
let raw = RawBlockBatch::new();
assert!(state.q1_raw_blocks.push((0, raw)).is_ok());
let mut worker = create_test_worker();
worker.held_decompressed = Some((50, DecompressedBatch { data: vec![0u8; 16] }, 16));
let result = try_step_decompress(&state, &mut worker);
assert!(result, "should succeed — held batch pushed, then new batch processed");
assert!(!state.has_error(), "should not error");
assert!(state.q1_raw_blocks.is_empty(), "Q1 should have been popped");
assert_eq!(state.q2_decompressed.len(), 2, "Q2 should have both batches");
assert!(worker.held_decompressed.is_none(), "held slot should be empty");
}
#[test]
fn test_decompress_held_blocked_by_full_q2() {
let state = create_test_state(1024);
let cap = state.q2_decompressed.capacity();
for i in 0..cap {
assert!(
state
.q2_decompressed
.push((i as u64, DecompressedBatch { data: vec![0u8; 8] }))
.is_ok(),
"failed to fill q2 at serial {i}"
);
}
assert!(state.q2_decompressed.is_full());
assert!(state.q1_raw_blocks.push((100, RawBlockBatch::new())).is_ok());
let mut worker = create_test_worker();
worker.held_decompressed = Some((50, DecompressedBatch { data: vec![0u8; 16] }, 16));
let result = try_step_decompress(&state, &mut worker);
assert!(!result, "should return false when Q2 is full");
assert!(!state.has_error(), "should NOT set error; got: {:?}", state.take_error());
assert!(worker.held_decompressed.is_some(), "held batch should be preserved");
assert!(!state.q1_raw_blocks.is_empty(), "Q1 should not have been popped");
}
#[test]
fn test_decode_held_pushes_unconditionally_when_q3_has_room() {
let state = create_test_state(1024);
setup_memory_backpressure(&state.q3_reorder_state);
let boundary = BoundaryBatch { buffer: Vec::new(), offsets: vec![0] };
assert!(state.q2b_boundaries.push((0, boundary)).is_ok());
let mut worker = create_test_worker();
worker.held_decoded = Some((50, vec![], 16));
let result = try_step_decode(&state, &mut worker);
assert!(result, "should succeed — held batch pushed, then new batch processed");
assert!(!state.has_error(), "should not error");
assert!(state.q2b_boundaries.is_empty(), "Q2b should have been popped");
assert_eq!(state.q3_decoded.len(), 2, "Q3 should have both batches");
assert!(worker.held_decoded.is_none(), "held slot should be empty");
}
#[test]
fn test_decode_held_blocked_by_full_q3() {
let state = create_test_state(1024);
let cap = state.q3_decoded.capacity();
for i in 0..cap {
assert!(
state.q3_decoded.push((i as u64, vec![])).is_ok(),
"failed to fill q3 at serial {i}"
);
}
assert!(state.q3_decoded.is_full());
let boundary = BoundaryBatch { buffer: Vec::new(), offsets: vec![0] };
assert!(state.q2b_boundaries.push((100, boundary)).is_ok());
let mut worker = create_test_worker();
worker.held_decoded = Some((50, vec![], 16));
let result = try_step_decode(&state, &mut worker);
assert!(!result, "should return false when Q3 is full");
assert!(!state.has_error(), "should NOT set error; got: {:?}", state.take_error());
assert!(worker.held_decoded.is_some(), "held batch should be preserved");
assert!(!state.q2b_boundaries.is_empty(), "Q2b should not have been popped");
}
#[test]
fn test_pipeline_functions_secondary_serialize() {
let fns = PipelineFunctions::<Vec<u8>, Vec<u8>>::new(Ok, |data, buf| {
buf.extend_from_slice(&data);
Ok(1)
});
assert!(fns.secondary_serialize_fn.is_none());
let fns = fns.with_secondary_serialize(|data: &Vec<u8>, buf: &mut Vec<u8>| {
buf.extend_from_slice(data);
Ok(1)
});
assert!(fns.secondary_serialize_fn.is_some());
let test_data = vec![1u8, 2, 3, 4];
let mut buf = Vec::new();
let count =
(fns.secondary_serialize_fn.as_ref().expect("secondary_serialize_fn should be set"))(
&test_data, &mut buf,
)
.expect("serialize should succeed");
assert_eq!(count, 1);
assert_eq!(buf, vec![1, 2, 3, 4]);
}
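// Minimal sanity checks for the serialization helpers and config builder
// defined above; both rely only on behavior visible in this module.
#[test]
fn test_serialize_bam_records_empty_input() {
let header = Header::default();
let batch = serialize_bam_records(&[], &header).expect("empty input should serialize");
assert_eq!(batch.record_count, 0);
assert!(batch.data.is_empty());
assert!(batch.secondary_data.is_none());
}
#[test]
fn test_bam_pipeline_config_builder_keeps_levels_in_sync() {
let config = BamPipelineConfig::new(2, 6).with_compression_level(3);
assert_eq!(config.compression_level, 3);
assert_eq!(config.pipeline.compression_level, 3);
assert!(config.group_key_config.is_none());
}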
}