// genomicframe-core 0.2.0
//
// High-performance genomics I/O and interoperability layer
// Documentation
//! Pushdown optimization support for VCF reader
//!
//! This module adds the ability to apply filters at the raw line level
//! before parsing, which can provide significant speedups (2-10x) when
//! filtering large VCF files.

use crate::error::Result;
use crate::filters::{PushdownFilter, RecordFilter};
use crate::formats::vcf::{VcfReader, VcfRecord};

/// Counters collected while a pushdown-filtered read is executed.
///
/// Every counter starts at zero when the value is built with [`Default`].
/// The type is comparable (`PartialEq`/`Eq`) so that expected and actual
/// statistics can be checked directly, e.g. in tests.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct PushdownStats {
    /// Total number of lines read.
    pub total_lines: usize,
    /// Lines rejected by the raw-line filter and therefore skipped without
    /// being parsed (the work that pushdown saved).
    pub pushdown_hits: usize,
    /// Lines that required full parsing into a record.
    pub full_parses: usize,
    /// Lines that passed the filter after being parsed.
    pub filter_passed: usize,
}

impl PushdownStats {
    /// Estimate the speedup obtained from pushdown filtering.
    ///
    /// The result is `total_lines / full_parses`. When no lines were read
    /// the result is `1.0`; when no line was fully parsed the divisor is
    /// clamped to one, so the result equals `total_lines`.
    pub fn speedup(&self) -> f64 {
        match self.total_lines {
            0 => 1.0,
            total => {
                // Clamp so a run without any full parse never divides by zero.
                let parsed = self.full_parses.max(1);
                total as f64 / parsed as f64
            }
        }
    }

    /// Print a human-readable report of all counters to standard output.
    pub fn print_summary(&self) {
        // The denominator is clamped so an empty run prints 0.0% instead of NaN.
        let hit_pct = 100.0 * self.pushdown_hits as f64 / self.total_lines.max(1) as f64;
        let speedup = self.speedup();

        println!("Pushdown Statistics:");
        println!("  Total lines: {}", self.total_lines);
        println!("  Pushdown hits: {} ({:.1}%)", self.pushdown_hits, hit_pct);
        println!("  Full parses: {}", self.full_parses);
        println!("  Filter passed: {}", self.filter_passed);
        println!("  Speedup: {:.2}x", speedup);
    }
}

/// Extension trait that gives a VCF reader a pushdown-filtered iteration mode.
pub trait VcfReaderPushdown {
    /// Consume the reader and start pushdown filtering with statistics tracking.
    ///
    /// The filter is meant to be evaluated against each raw line before the
    /// line is parsed, which can yield significant speedups for selective
    /// queries.
    ///
    /// Returns the filtering iterator together with a [`PushdownStats`] value.
    ///
    /// NOTE(review): the statistics are returned by value at construction
    /// time, so they are a separate value from the `stats` field stored inside
    /// [`PushdownIterator`]; confirm against the implementation whether they
    /// are expected to reflect counts gathered during iteration.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use genomicframe_core::formats::vcf::VcfReader;
    /// use genomicframe_core::formats::vcf::filters::QualityFilter;
    /// use genomicframe_core::formats::vcf::pushdown::VcfReaderPushdown;
    /// use genomicframe_core::core::GenomicRecordIterator;
    ///
    /// let reader = VcfReader::from_path("variants.vcf.gz")?;
    /// let filter = QualityFilter { min_qual: 30.0 };
    ///
    /// let (records, stats) = reader.with_pushdown(filter)?;
    ///
    /// for record in records {
    ///     // Only high-quality variants, parsed efficiently
    /// }
    ///
    /// stats.print_summary(); // Show speedup metrics
    /// # Ok::<(), genomicframe_core::error::Error>(())
    /// ```
    fn with_pushdown<F: PushdownFilter<VcfRecord>>(
        self,
        filter: F,
    ) -> Result<(PushdownIterator<F>, PushdownStats)>;
}

/// Iterator that applies pushdown filtering.
///
/// It owns the reader it draws from, the filter to evaluate, and the
/// counters describing the run.
pub struct PushdownIterator<F: PushdownFilter<VcfRecord>> {
    /// The VCF reader owned by this iterator.
    reader: VcfReader,
    /// The filter supplied by the caller.
    filter: F,
    /// Counters for this iterator's run.
    stats: PushdownStats,
}

impl<F> Iterator for PushdownIterator<F>
where
    F: PushdownFilter<VcfRecord>,
{
    type Item = Result<VcfRecord>;

    fn next(&mut self) -> Option<Self::Item> {
        use crate::core::GenomicRecordIterator;
        use std::io::BufRead;

        // Access the internal reader (we'll need to modify VcfReader to expose this)
        // For now, this is a placeholder showing the concept

        // The actual implementation would:
        // 1. Read line
        // 2. Call filter.test_raw(line)
        // 3. If false, skip (pushdown_hits++)
        // 4. If true, parse and call filter.test(record)
        // 5. Track stats

        // This requires modifying VcfReader to expose its internal reader
        // or adding a new constructor that takes ownership

        None // Placeholder
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Typical run: 1000 lines read, 100 of them parsed.
    #[test]
    fn test_pushdown_stats() {
        let stats = PushdownStats {
            total_lines: 1000,
            pushdown_hits: 900,
            full_parses: 100,
            filter_passed: 50,
        };

        assert_eq!(stats.speedup(), 10.0); // 1000/100 = 10x speedup
    }

    /// With nothing read there is nothing to speed up: the ratio is 1.0.
    #[test]
    fn test_speedup_with_no_lines() {
        let stats = PushdownStats::default();

        assert_eq!(stats.speedup(), 1.0);
    }

    /// When every line was skipped the divisor is clamped to one, so the
    /// result equals the number of lines read instead of dividing by zero.
    #[test]
    fn test_speedup_with_no_full_parses() {
        let stats = PushdownStats {
            total_lines: 40,
            pushdown_hits: 40,
            full_parses: 0,
            filter_passed: 0,
        };

        assert_eq!(stats.speedup(), 40.0);
    }
}