smdiff_reader/
lib.rs

//!
//! This library reads the underlying smdiff format. It does not handle secondary decompression.
//!
//! If you need a reader that handles secondary decompression, use the `smdiff-decoder::reader` module, which wraps this library.
//!
//! The main struct is the `SectionIterator`. It reads one section at a time and returns that section's ops and header.
//!
//! The building blocks of that reader are exposed so other users can build their own readers.
//!
10use std::io::Read;
11
12use smdiff_common::{diff_addresses_to_u64, read_i_varint, read_u16, read_u8, read_u_varint, size_routine, AddOp, Copy, CopySrc, Format, Run, SectionHeader, Size, ADD, COPY_D, COPY_O, OP_MASK, RUN, SECTION_COMPRESSION_MASK, SECTION_COMPRESSION_RSHIFT, SECTION_CONTINUE_BIT, SECTION_FORMAT_BIT, SIZE_MASK};
13
14
15/// Op Type alias for the Readers Add type
16pub type Op = smdiff_common::Op<Add>;
17
/// Add operation as produced by the reader: an owned buffer of literal bytes.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Add {
    /// The bytes carried by this Add operation.
    pub bytes: Vec<u8>,
}

impl Add {
    /// Wraps `bytes` in an `Add` without copying them.
    pub fn new(bytes: Vec<u8>) -> Self {
        Self { bytes }
    }
}
28
29impl AddOp for Add{
30    fn bytes(&self) -> &[u8] {
31        &self.bytes
32    }
33}
34/// Reads a section header from the reader at the current position.
35pub fn read_section_header<R: std::io::Read>(reader: &mut R) -> std::io::Result<SectionHeader> {
36    let header_byte = read_u8(reader)?;
37    let compression_algo = (header_byte & SECTION_COMPRESSION_MASK) >> SECTION_COMPRESSION_RSHIFT;
38    let format = if header_byte & SECTION_FORMAT_BIT == SECTION_FORMAT_BIT{Format::Segregated} else {Format::Interleaved};
39    let more_sections = (header_byte & SECTION_CONTINUE_BIT) == SECTION_CONTINUE_BIT;
40    let num_operations = read_u_varint(reader)? as u32;
41    let num_add_bytes = if format.is_segregated() {
42        read_u_varint(reader)? as u32
43    } else {
44        0
45    };
46    let read_size = read_u_varint(reader)? as u32;
47    let output_size = if format.is_segregated(){
48        num_add_bytes + read_size
49    }else{
50        read_size
51    };
52
53    Ok(SectionHeader {
54        compression_algo,
55        format,
56        more_sections,
57        num_operations,
58        num_add_bytes,
59        output_size,
60    })
61}
62
63/// Reads the operations from the reader at the current position. Cannot have secondary compression still applied.
64///
65/// The mutable reference to the section header is so that the
66/// function can update the number of add bytes in the event the format is interleaved.
67/// This way the header reflects reality regardless of if it was originally encoded in the header.
68pub fn read_ops_no_comp<R: std::io::Read>(reader: &mut R, header:&mut SectionHeader,op_buffer:&mut Vec<Op>)-> std::io::Result<()>{
69    let SectionHeader { format, num_operations, output_size, .. } = header;
70    //dbg!(&header);
71    let mut cur_d_addr = 0;
72    let mut cur_o_addr = 0;
73    op_buffer.reserve(*num_operations as usize);
74    match format {
75        Format::Segregated => {
76            let buffer_offset = op_buffer.len();
77            let mut add_idxs = Vec::new();
78            let mut check_size = 0;
79            //dbg!(num_operations,output_size,num_add_bytes);
80            for i in 0..*num_operations {
81                let op = read_op(reader, &mut cur_d_addr, &mut cur_o_addr,false)?;
82                let len = op.oal() as u32;
83                check_size += len;
84                if op.is_add(){
85                    header.num_add_bytes += len;
86                    add_idxs.push(buffer_offset+i as usize);
87                }
88                op_buffer.push(op);
89            }
90            if &check_size != output_size{
91                return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, format!("Window Header output size: {} != Sum(ops.oal()) {}",output_size,check_size)));
92            }
93            //reader should be at the end of the instructions
94            //now we go back and fill the add op buffers
95            for i in add_idxs{
96                let op = op_buffer.get_mut(i).unwrap();
97                if let Op::Add(add) = op{
98                    reader.read_exact(&mut add.bytes)?;
99                }
100            }
101            Ok(())
102        },
103        Format::Interleaved => {
104            let mut check_size = 0;
105            for _ in 0..*num_operations {
106                let op = read_op(reader, &mut cur_d_addr, &mut cur_o_addr,true)?;
107                check_size += op.oal() as u32;
108                op_buffer.push(op);
109            }
110            if &check_size != output_size{
111                return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, format!("Window Header output size: {} != Sum(ops.oal()) {}",output_size,check_size)));
112            }
113            Ok(())
114        }
115    }
116}
117
118///Returns the ops and the output size. Cannot have secondary compression still applied.
119///
120/// This is just a wrapper that completely reads a section from the reader.
121pub fn read_section<R: std::io::Read>(reader: &mut R, op_buffer:&mut Vec<Op>) -> std::io::Result<SectionHeader> {
122    let mut header = read_section_header(reader)?;
123    op_buffer.reserve(header.num_operations as usize);
124    read_ops_no_comp(reader, &mut header, op_buffer)?;
125    Ok(header)
126}
127
128#[derive(Copy, Clone, Debug, PartialEq, Eq)]
129enum OpType{
130    Copy{src:CopySrc},
131    Add,
132    Run
133}
134
135struct OpByte{
136    op:OpType,
137    size:Size
138}
139fn read_op_byte<R: std::io::Read>(reader: &mut R) -> std::io::Result<OpByte> {
140    let byte = read_u8(reader)?;
141    let size_indicator = byte & SIZE_MASK;
142    let op_type = byte & OP_MASK;
143
144    let size = size_routine(size_indicator as u16);
145    match op_type {
146        COPY_D => Ok(OpByte{op:OpType::Copy { src: CopySrc::Dict },size}),
147        COPY_O => Ok(OpByte{op:OpType::Copy { src: CopySrc::Output },size}),
148        ADD => Ok(OpByte{op:OpType::Add,size}),
149        RUN => Ok(OpByte{op:OpType::Run,size}),
150        _ => Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "Invalid op type")),
151    }
152}
153/// Reads an operation from the reader at the given position.
154/// * `reader` - The reader to read from.
155/// * `cur_d_addr` - The last used copy dictionary address.
156/// * `cur_o_addr` - The last used copy output address.
157/// * `is_interleaved` - If the format is interleaved.
158///
159/// If this is segregated format, the Add ops will just be initialized to all zeros in the bytes field.
160/// The caller will need to fill in the bytes later.
161pub fn read_op<R: std::io::Read>(reader: &mut R,cur_d_addr:&mut u64,cur_o_addr:&mut u64,is_interleaved:bool) -> std::io::Result<Op> {
162    let OpByte { op, size } = read_op_byte(reader)?;
163    if matches!(op, OpType::Run) && !matches!(size, Size::Done(_)) {
164        return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "Invalid size for RUN operation"));
165    }
166    let size = match size {
167        Size::Done(size) => size as u16,
168        Size::U8And62 => read_u8(reader)? as u16 + 62,
169        Size::U16 => read_u16(reader)?,
170    };
171    let op = match op {
172        OpType::Copy { src } => {
173            let addr = read_i_varint(reader)?;
174            let len = size;
175            let addr = if src == CopySrc::Dict {
176                *cur_d_addr = diff_addresses_to_u64(*cur_d_addr, addr);
177                *cur_d_addr
178            } else {
179                *cur_o_addr = diff_addresses_to_u64(*cur_o_addr, addr);
180                *cur_o_addr
181            };
182            Op::Copy(Copy{src,addr,len})
183        },
184        OpType::Add => {
185            let mut bytes = vec![0u8;size as usize];
186            if is_interleaved{
187                reader.read_exact(&mut bytes)?;
188            }
189            Op::Add(Add{bytes})
190        },
191        OpType::Run => {
192            Op::Run(Run{len:size as u8,byte:read_u8(reader)?})
193        }
194    };
195    Ok(op)
196}
197
198/// A reader that will keep reading sections until it reaches the terminal section.
199pub struct SectionIterator<R>{
200    source: R,
201    done:bool,
202    op_buffer: Vec<Op>,
203}
204impl<R: Read> SectionIterator<R>{
205    pub fn new(source: R) -> Self {
206        Self {
207            source,
208            done:false,
209            op_buffer: Vec::new(),
210        }
211    }
212    ///Reads and returns the next section (if it exists).
213    ///
214    /// This is useful if you don't need the Ops, just need to read them.
215    pub fn next_borrowed(&mut self) -> Option<std::io::Result<(&[Op],SectionHeader)>> {
216        if self.done{
217            return None;
218        }
219        self.op_buffer.clear();
220        let header = match read_section(&mut self.source,&mut self.op_buffer){
221            Ok(v) => v,
222            Err(e) => return Some(Err(e)),
223
224        };
225        if !header.more_sections{
226            self.done = true;
227        }
228        Some(Ok((self.op_buffer.as_slice(),header)))
229    }
230    ///In the event the caller needs to do something to the ops (more than just read them), this avoids the need to clone the slice.
231    fn next_owned(&mut self) -> Option<std::io::Result<(Vec<Op>,SectionHeader)>> {
232        if self.done{
233            return None;
234        }
235        let mut op_buffer = Vec::new();
236        let header = match read_section(&mut self.source,&mut op_buffer){
237            Ok(v) => v,
238            Err(e) => return Some(Err(e)),
239
240        };
241        if !header.more_sections{
242            self.done = true;
243        }
244        Some(Ok((op_buffer,header)))
245    }
246    pub fn into_inner(self) -> R {
247        self.source
248    }
249}
250impl<R: Read> Iterator for SectionIterator<R> {
251    type Item = std::io::Result<(Vec<Op>, SectionHeader)>;
252
253    fn next(&mut self) -> Option<Self::Item> {
254        self.next_owned()
255    }
256}
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;

    /// Decodes `encoded` section by section and checks it against `expected`, one entry per section.
    ///
    /// Unlike a `while let Some(Ok(..))` loop combined with `zip`, this fails when a section
    /// cannot be decoded, when a section holds too few or too many ops, or when the number of
    /// sections differs from what is expected.
    fn assert_sections(encoded: Vec<u8>, expected: &[Vec<Op>]) {
        let mut reader = SectionIterator::new(Cursor::new(encoded));
        let mut seen = 0;
        while let Some(result) = reader.next_borrowed() {
            let (read_ops, header) = result.expect("section should decode without error");
            let want = expected
                .get(seen)
                .expect("decoded more sections than expected");
            assert_eq!(read_ops, want.as_slice(), "ops of section {}", seen);
            assert_eq!(
                header.more_sections,
                seen + 1 < expected.len(),
                "continue bit of section {}",
                seen
            );
            seen += 1;
        }
        assert_eq!(seen, expected.len(), "number of decoded sections");
    }

    #[test]
    fn test_basic_add_run() {
        // Setup
        let ops = vec![
            Op::Add(Add::new("he".as_bytes().to_vec())),
            Op::Run(Run { byte: b'l', len: 2 }),
            Op::Add(Add::new("o".as_bytes().to_vec())),
        ];
        let answer = vec![
            0, // 0b0_0_000_000
            3, //num_ops uvarint
            5, //output size uvarint
            130, //ADD, Size 2 0b10_000010
            104, //'h'
            101, //'e'
            194, //RUN, Size 2 0b11_000010
            108, //'l'
            129, //ADD, Size 1 0b10_000001
            111, //'o'
        ];
        assert_sections(answer, &[ops]);
    }

    #[test]
    fn test_hello_micro() {
        // Instructions
        // "hello" -> "Hello! Hello!"
        let ops = vec![
            Op::Add(Add::new("H".as_bytes().to_vec())),
            Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
            Op::Add(Add::new("! ".as_bytes().to_vec())),
            Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
        ];
        let answer = vec![
            0, // 0b0_0_000_000
            4, //num_ops uvarint
            13, //output size uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0
        ];
        assert_sections(answer, &[ops]);
    }

    #[test]
    fn test_hello_win() {
        //src will be 'hello' and output will be 'Hello! Hello!'
        //Each op is encoded in its own segregated section:
        //Add(H), Copy(ello) from the dict, Add("! "), Copy(Hello!) from the output.
        let ops = [
            vec![
                Op::Add(Add::new("H".as_bytes().to_vec())),
            ],
            vec![
                Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
            ],
            vec![
                Op::Add(Add::new("! ".as_bytes().to_vec())),
            ],
            vec![
                Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
            ],
        ];

        let answer = vec![
            192, // 0b1_1_000_000
            1, //Num ops uvarint
            1, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            4, //Output size uvarint diff encoded from add uvarint
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            2, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '

            64, // 0b0_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            6, //Output size uvarint diff encoded from add uvarint
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0
        ];
        assert_sections(answer, &ops);
    }

    #[test]
    fn kitchen_sink_transform() {
        //src will be 'hello'.
        //Each op is encoded in its own segregated section:
        //Add(H), Copy from the dict (addr 1, len 4), Add("! "),
        //Copy from the output (addr 0, len 6), Copy from the output (addr 6, len 5),
        //and finally a Run of three '.' bytes.
        let ops = [
            vec![
                Op::Add(Add::new("H".as_bytes().to_vec())),
            ],
            vec![
                Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
            ],
            vec![
                Op::Add(Add::new("! ".as_bytes().to_vec())),
            ],
            vec![
                Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
            ],
            vec![
                Op::Copy(Copy { src: CopySrc::Output, addr: 6, len: 5 }),
            ],
            vec![
                Op::Run(Run { byte: b'.', len: 3 }),
            ],
        ];

        let answer = vec![
            192, // 0b1_1_000_000
            1, //Num ops uvarint
            1, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            4, //Output size uvarint diff encoded from add uvarint
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            2, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            6, //Output size uvarint diff encoded from add uvarint
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            5, //Output size uvarint diff encoded from add uvarint
            69, //COPY_O, Size 5 0b01_000101
            12, //addr ivar int +6

            64, // 0b0_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            3, //Output size uvarint diff encoded from add uvarint
            195, //Run, Size 3 0b11_000011
            46, //'.'
        ];
        assert_sections(answer, &ops);
    }
}
472