idx_decoder/
lib.rs

1//! ## About
2//! An IDX file format decoding library. (Currently WIP)
3//! 
//! The main type is [`IDXDecoder`]. It implements `Iterator`, and each `Item`
//! corresponds to one item stored in the file.
6//! 
7//! ### Type parameters
8//! 
9//! [`IDXDecoder`] takes three type parameters.
10//! - `R`: Reader from which data is taken. Can be file, network stream etc.
11//! - `T`: Type of items produced by Iterator. E.g. U8, I16, F32.
12//!   
13//!   All possible types can be found in [`types`](types/index.html) module
14//! - `D`: Type-level integer of dimensions. Must be less than 256.
15//!   
16//!   If it's less than 128 use nalgebra's U* types.
17//!   For value >=128 use typenum's consts.
18//! 
19//! ### Dimensions
20//! 
//! A one-dimensional decoder simply yields the items themselves.
//! 
//! For more dimensions, each output is a `Vec` holding the values of a single item.
24//! 
25//! E.g. a 3-dimensional decoder where items are of size 4x4 will return `Vec`s
26//! of length 16.
27//! 
//! The first dimension of the decoder corresponds to the number of items left.
29//! 
30//! ## Caveats
31//! 
//! Currently the decoder only implements `Iterator` for 1 and 3 dimensions,
//! simply because the other cases have not been implemented yet.
//! 
//! The crate also assumes that items are stored in big-endian byte order, just like the sizes.
36//! 
//! If you find a bug or the crate is missing some functionality,
//! open an issue or send a pull request.
39//! 
40//! ## Example
41//! ```ignore
42//! let file = std::fs::File::open("data.idx")?;
43//! let decode = idx_decoder::IDXDecoder::<_, idx_decoder::types::U8, nalgebra::U1>::new(file)?;
44//! for item in decode {
45//!     println!("Item: {}", item);
46//! }
47//! ```
48//! 
49//! ## Acknowledgement
//! This crate is implemented according to the file format description
//! found at <http://yann.lecun.com/exdb/mnist/>
52//! 
53//! [`IDXDecoder`]: struct.IDXDecoder.html
54
55use std::{convert::TryInto, io::{self, Read}, marker::PhantomData};
56use nalgebra::{self as na, VectorN, DimName, allocator::Allocator, DefaultAllocator};
57// use typenum::{self as tn, type_operators::IsLess};
58use thiserror::Error;
59
/// Types used by [`IDXDecoder`](struct.IDXDecoder.html) to specify iterator's output type
pub mod types {
    use std::{io::Read, mem::size_of};

    #[doc(hidden)]
    mod private {
        /// Supertrait used to keep `Type` from being implemented elsewhere.
        pub trait Sealed {}
    }
    // `self::` keeps this import valid regardless of the edition's path rules.
    use self::private::Sealed;

    /// Trait implemented by output types used by IDXDecoder's iterator
    ///
    /// It can't be implemented outside this crate.
    pub trait Type: Sealed {
        /// Type code associated with this marker.
        const VALUE: u8;
        /// Rust type of the values associated with this marker.
        type TypeValue;
    }

    /// Implemented by value types that can be read from a reader as big-endian bytes.
    #[doc(hidden)]
    pub trait BEReadable<R>: Sized {
        /// Reads one value from `r`, returning `None` if the read fails.
        fn read_self(r: &mut R) -> Option<Self>;
    }

    // Declares marker types whose values are integers decoded with `from_be_bytes`.
    macro_rules! new_type_int {
        ( $( $(#[$attr:meta])* $vis:vis $name:ident : $tv:ty = $value:expr,)* ) => {
            $(
                $(#[$attr])*
                #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
                $vis struct $name;
                impl Sealed for $name {}
                impl Type for $name {
                    type TypeValue = $tv;
                    const VALUE: u8 = $value;
                }

                impl<R: Read> BEReadable<R> for $tv {
                    fn read_self(r: &mut R) -> Option<Self> {
                        let mut buf = [0u8; size_of::<$tv>()];
                        r.read_exact(&mut buf).ok()?;
                        Some(Self::from_be_bytes(buf))
                    }
                }
            )*
        }
    }

    // Declares marker types whose values are floats: the bytes are decoded as the
    // same-width unsigned integer and then reinterpreted with `from_bits`.
    macro_rules! new_type_f {
        ( $( $(#[$attr:meta])* $vis:vis $name:ident : $uint:ty as $tv:ty = $value:expr,)* ) => {
            $(
                $(#[$attr])*
                #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
                $vis struct $name;
                impl Sealed for $name {}
                impl Type for $name {
                    type TypeValue = $tv;
                    const VALUE: u8 = $value;
                }

                impl<R: Read> BEReadable<R> for $tv {
                    fn read_self(r: &mut R) -> Option<Self> {
                        let mut buf = [0u8; size_of::<$tv>()];
                        r.read_exact(&mut buf).ok()?;
                        Some(Self::from_bits(<$uint>::from_be_bytes(buf)))
                    }
                }
            )*
        }
    }

    new_type_int!(
        /// Marker for `u8` items (type code `0x08`).
        pub U8: u8 = 0x08,
        /// Marker for `i8` items (type code `0x09`).
        pub I8: i8 = 0x09,
        /// Marker for `i16` items (type code `0x0b`).
        pub I16: i16 = 0x0b,
        /// Marker for `i32` items (type code `0x0c`).
        pub I32: i32 = 0x0c,
    );
    new_type_f!(
        /// Marker for `f32` items (type code `0x0d`).
        pub F32: u32 as f32 = 0x0d,
        /// Marker for `f64` items (type code `0x0e`).
        pub F64: u64 as f64 = 0x0e,
    );
}
135
136use types::*;
137
138/// The decoder. Check [crate level docs](index.html) for more informations
139pub struct IDXDecoder<R, T: Type, D: DimName>
140where
141    DefaultAllocator: Allocator<u32, D>
142{
143    reader: R,
144    output_type: PhantomData<fn() -> T>,
145    dimensions: VectorN<u32, D>,
146}
147
148/// Error type return by `IDXDecoder::new`
149#[derive(Debug, Error)]
150pub enum IDXError {
151    #[error("Wrong magic, first two bytes should be zero")]
152    WrongMagic,
153    #[error("Wrong type, expected {0}, got {1}")]
154    WrongType(u8, u8),
155    #[error("Wrong number of dimensions, expected {0}, got {1}")]
156    WrongDimensions(u8, u8),
157    #[error("Too many dimensions, must be less than 256")]
158    TooManyDimensons,
159    #[error("{0}")]
160    IOError(#[from] io::Error),
161}
162
163impl<R: Read, T: Type, D: DimName> IDXDecoder<R, T, D>
164where
165    // D: IsLess<tn::consts::U256>,
166    DefaultAllocator: Allocator<u32, D>
167{
168    /// Returns error in case provided types aren't valid 
169    pub fn new(mut reader: R) -> Result<Self, IDXError> {
170        // Read magic and check if it's valid
171        let mut buf = [0u8; 4];
172        reader.read_exact(&mut buf)?;
173        if buf[0] != 0 || buf[1] != 0 { Err(IDXError::WrongMagic)? }
174        if buf[2] != T::VALUE { Err(IDXError::WrongType(T::VALUE, buf[2]))? }
175        let dims: u8 = D::dim().try_into().or(Err(IDXError::TooManyDimensons))?;
176        if buf[3] != dims { Err(IDXError::WrongDimensions(dims, buf[3]))? }
177
178        // Read dimensions
179        // To simplify code we treat amount of items as first dimension
180        let mut dimensions: VectorN<u32, D> = na::zero();
181        for d in dimensions.iter_mut() {
182            let mut buf = [0u8; 4];
183            reader.read_exact(&mut buf)?;
184            *d = u32::from_be_bytes(buf);
185        }
186        Ok(IDXDecoder { reader, output_type: PhantomData, dimensions })
187    }
188
189    /// Size of return values.
190    /// 
191    /// First dimension of decoder corresponds to amount of items left.
192    pub fn dimensions(&self) -> VectorN<u32, D> {
193        self.dimensions.clone()
194    }
195}
196
197impl<R: Read, T: Type> Iterator for IDXDecoder<R, T, na::U1>
198where
199    DefaultAllocator: Allocator<u32, na::U1>,
200    T::TypeValue: BEReadable<R>,
201{
202    type Item = T::TypeValue;
203    fn next(&mut self) -> Option<Self::Item> {
204        if self.dimensions[0] > 0 {
205            self.dimensions[0] -= 1;
206            T::TypeValue::read_self(&mut self.reader)
207        } else {
208            None
209        }
210    }
211}
212
213impl<R: Read, T: Type> Iterator for IDXDecoder<R, T, na::U3>
214where
215    DefaultAllocator: Allocator<u32, na::U3>,
216    T::TypeValue: Default + Clone + BEReadable<R>,
217{
218    type Item = Vec<T::TypeValue>;
219    fn next(&mut self) -> Option<Self::Item> {
220        if self.dimensions[0] > 0 {
221            self.dimensions[0] -= 1;
222            let as_usize = |n: u32| -> Option<usize> { n.try_into().ok() };
223            let len = as_usize(self.dimensions[1])?.checked_mul(as_usize(self.dimensions[2])?)?;
224            let mut items = vec![Default::default(); len];
225            for item in items.iter_mut() {
226                *item = T::TypeValue::read_self(&mut self.reader)?
227            }
228            Some(items)
229        } else {
230            None
231        }
232    }
233
234    fn size_hint(&self) -> (usize, Option<usize>) {
235        (0, self.dimensions[0].try_into().ok())
236    }
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// A 1-D stream of three `u8` values is yielded one by one, then ends.
    #[test]
    fn example_1d() {
        let bytes: &[u8] = &[
            // magic: two zero bytes, type code for u8, 1 dimension
            0, 0, 8, 1,
            // item count as a big-endian u32
            0, 0, 0, 3,
            // items
            1, 2, 3,
        ];
        let mut decoder = IDXDecoder::<_, U8, nalgebra::U1>::new(std::io::Cursor::new(bytes))
            .expect("Decoder creation error");

        for expected in vec![1u8, 2, 3] {
            assert_eq!(decoder.next(), Some(expected));
        }
        assert_eq!(decoder.next(), None);
    }

    /// A 3-D stream of three 2x2 items is yielded as three `Vec`s of length 4.
    #[test]
    fn example_3d() {
        let bytes: &[u8] = &[
            // magic: two zero bytes, type code for u8, 3 dimensions
            0, 0, 8, 3,
            // sizes as big-endian u32: 3 matrices of 2x2
            0, 0, 0, 3,
            0, 0, 0, 2,
            0, 0, 0, 2,
            // items
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
        ];
        let mut decoder = IDXDecoder::<_, U8, nalgebra::U3>::new(std::io::Cursor::new(bytes))
            .expect("Decoder creation error");

        let payload: Vec<u8> = (1..=12).collect();
        for expected in payload.chunks(4) {
            assert_eq!(decoder.next(), Some(expected.to_vec()));
        }
        assert_eq!(decoder.next(), None);
    }
}