aisle 0.2.0

Metadata-driven Parquet pruning for Rust: Skip irrelevant data before reading
Documentation
use std::collections::HashMap;

use arrow_schema::Schema;
use parquet::{
    bloom_filter::Sbbf, file::metadata::ParquetMetaData, schema::types::SchemaDescriptor,
};

pub(crate) struct RowGroupContext<'a> {
    pub(crate) metadata: &'a ParquetMetaData,
    pub(crate) schema: &'a Schema,
    pub(crate) column_lookup: &'a HashMap<String, usize>,
    pub(crate) row_group_idx: usize,
    pub(crate) bloom_filters: Option<HashMap<usize, Sbbf>>,
    pub(crate) options: &'a super::options::PruneOptions,
}

impl RowGroupContext<'_> {
    pub(crate) fn bloom_for_column(&self, column: &str) -> Option<&Sbbf> {
        let col_idx = *self.column_lookup.get(column)?;
        self.bloom_filters.as_ref()?.get(&col_idx)
    }
}

pub(crate) fn build_column_lookup(schema: &SchemaDescriptor) -> HashMap<String, usize> {
    let mut name_counts: HashMap<String, usize> = HashMap::new();
    for column in schema.columns() {
        *name_counts.entry(column.name().to_string()).or_insert(0) += 1;
    }

    let mut lookup = HashMap::new();
    for (idx, column) in schema.columns().iter().enumerate() {
        let path = column.path().string();
        lookup.insert(path, idx);
    }
    for (idx, column) in schema.columns().iter().enumerate() {
        let name = column.name();
        if name_counts.get(name).copied().unwrap_or(0) == 1 {
            lookup.entry(name.to_string()).or_insert(idx);
        }
    }
    lookup
}