Skip to main content

hdf5_reader/filters/
mod.rs

1pub mod deflate;
2pub mod fletcher32;
3#[cfg(feature = "lz4")]
4pub mod lz4;
5pub mod shuffle;
6
7use std::collections::HashMap;
8
9use crate::error::{Error, Result};
10use crate::messages::filter_pipeline::FilterDescription;
11
// Standard HDF5 filter IDs.

/// Deflate compression filter.
pub const FILTER_DEFLATE: u16 = 1;
/// Byte-shuffle filter.
pub const FILTER_SHUFFLE: u16 = 2;
/// Fletcher-32 checksum filter.
pub const FILTER_FLETCHER32: u16 = 3;
/// SZIP filter.
pub const FILTER_SZIP: u16 = 4;
/// N-bit filter.
pub const FILTER_NBIT: u16 = 5;
/// Scale-offset filter.
pub const FILTER_SCALEOFFSET: u16 = 6;
/// HDF5 registered LZ4 filter.
pub const FILTER_LZ4: u16 = 32_004;
21
/// A user-supplied filter function.
///
/// Takes the input data and element size, returns the decoded output.
///
/// The closure is boxed so implementations of different concrete types can
/// live side by side in one [`FilterRegistry`], and it must be `Send + Sync`.
/// Failures are reported through the crate's [`Result`] type.
pub type FilterFn = Box<dyn Fn(&[u8], usize) -> Result<Vec<u8>> + Send + Sync>;
26
/// A registry of filter implementations.
///
/// Maps an HDF5 filter ID to the [`FilterFn`] that decodes it.
///
/// Comes pre-loaded with deflate, shuffle, and fletcher32 (plus LZ4 when the
/// `lz4` feature is enabled). Users can register additional filters
/// (e.g., Blosc, LZ4, ZFP) before reading datasets.
pub struct FilterRegistry {
    // One decoder per filter ID; registering an ID again replaces its entry.
    filters: HashMap<u16, FilterFn>,
}
34
35impl FilterRegistry {
36    /// Create a new registry with the built-in filters pre-registered.
37    pub fn new() -> Self {
38        let mut registry = FilterRegistry {
39            filters: HashMap::new(),
40        };
41        registry.register(
42            FILTER_DEFLATE,
43            Box::new(|data, _| deflate::decompress(data)),
44        );
45        registry.register(
46            FILTER_SHUFFLE,
47            Box::new(|data, elem_size| Ok(shuffle::unshuffle(data, elem_size))),
48        );
49        registry.register(
50            FILTER_FLETCHER32,
51            Box::new(|data, _| fletcher32::verify_and_strip(data)),
52        );
53        #[cfg(feature = "lz4")]
54        registry.register(FILTER_LZ4, Box::new(|data, _| lz4::decompress(data)));
55        registry
56    }
57
58    /// Register a custom filter implementation for the given filter ID.
59    ///
60    /// Overwrites any previously registered filter with the same ID.
61    pub fn register(&mut self, id: u16, f: FilterFn) {
62        self.filters.insert(id, f);
63    }
64
65    /// Apply a single filter by ID.
66    pub fn apply(&self, id: u16, data: &[u8], element_size: usize) -> Result<Vec<u8>> {
67        match self.filters.get(&id) {
68            Some(f) => f(data, element_size),
69            None => Err(Error::UnsupportedFilter(format!("filter id {}", id))),
70        }
71    }
72}
73
74impl Default for FilterRegistry {
75    fn default() -> Self {
76        Self::new()
77    }
78}
79
80/// Apply the filter pipeline in reverse (decompression direction) to a chunk.
81///
82/// HDF5 stores filters in the order they were applied during writing.
83/// On read, we apply them in reverse order.
84///
85/// If `registry` is `None`, the built-in filter set is used.
86///
87/// `filter_mask` is a bitmask where bit N being set means filter N should be skipped.
88pub fn apply_pipeline(
89    data: &[u8],
90    filters: &[FilterDescription],
91    filter_mask: u32,
92    element_size: usize,
93    registry: Option<&FilterRegistry>,
94) -> Result<Vec<u8>> {
95    // Count active filters to decide on single-buffer vs double-buffer strategy.
96    let active_count = filters
97        .iter()
98        .enumerate()
99        .rev()
100        .filter(|(i, _)| filter_mask & (1 << i) == 0)
101        .count();
102
103    if active_count == 0 {
104        return Ok(data.to_vec());
105    }
106
107    // For a single active filter, avoid the double-buffer overhead.
108    if active_count == 1 {
109        for (i, filter) in filters.iter().enumerate().rev() {
110            if filter_mask & (1 << i) != 0 {
111                continue;
112            }
113            return if let Some(reg) = registry {
114                reg.apply(filter.id, data, element_size)
115            } else {
116                apply_builtin_filter(filter, data, element_size)
117            };
118        }
119    }
120
121    // Multi-filter pipeline: the first stage reads from the borrowed input
122    // slice (avoiding a copy), subsequent stages consume the previous output.
123    // Each filter stage necessarily allocates (output sizes are unpredictable),
124    // but we avoid the initial data.to_vec() copy.
125    let mut owned: Option<Vec<u8>> = None;
126
127    for (i, filter) in filters.iter().enumerate().rev() {
128        if filter_mask & (1 << i) != 0 {
129            continue;
130        }
131
132        let input: &[u8] = match &owned {
133            Some(buf) => buf,
134            None => data,
135        };
136
137        owned = Some(if let Some(reg) = registry {
138            reg.apply(filter.id, input, element_size)?
139        } else {
140            apply_builtin_filter(filter, input, element_size)?
141        });
142    }
143
144    Ok(owned.unwrap_or_else(|| data.to_vec()))
145}
146
147fn apply_builtin_filter(
148    filter: &FilterDescription,
149    data: &[u8],
150    element_size: usize,
151) -> Result<Vec<u8>> {
152    match filter.id {
153        FILTER_DEFLATE => deflate::decompress(data),
154        FILTER_SHUFFLE => Ok(shuffle::unshuffle(data, element_size)),
155        FILTER_FLETCHER32 => fletcher32::verify_and_strip(data),
156        FILTER_SZIP => Err(Error::UnsupportedFilter("szip".into())),
157        FILTER_NBIT => Err(Error::UnsupportedFilter("nbit".into())),
158        FILTER_SCALEOFFSET => Err(Error::UnsupportedFilter("scaleoffset".into())),
159        #[cfg(feature = "lz4")]
160        FILTER_LZ4 => lz4::decompress(data),
161        id => Err(Error::UnsupportedFilter(format!("filter id {}", id))),
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh registry already contains the always-on built-in filters.
    #[test]
    fn test_filter_registry_default() {
        let registry = FilterRegistry::new();
        for id in &[FILTER_DEFLATE, FILTER_SHUFFLE, FILTER_FLETCHER32] {
            assert!(registry.filters.contains_key(id));
        }
    }

    /// A user-registered filter is reachable through `apply`.
    #[test]
    fn test_filter_registry_custom() {
        let mut registry = FilterRegistry::new();
        // Identity filter: hands back a copy of its input.
        let passthrough: FilterFn = Box::new(|bytes, _| Ok(bytes.to_vec()));
        registry.register(32000, passthrough);

        let decoded = registry.apply(32000, &[1, 2, 3], 1).unwrap();
        assert_eq!(decoded, vec![1, 2, 3]);
    }

    /// An ID with no registered filter is reported as unsupported.
    #[test]
    fn test_filter_registry_unknown() {
        let registry = FilterRegistry::new();
        let outcome = registry.apply(9999, &[1, 2, 3], 1);
        assert!(matches!(outcome, Err(Error::UnsupportedFilter(_))));
    }
}