smdiff_writer/
lib.rs

//! This crate is used to *construct* valid SMDIFF format delta files.
//! It is *not* an encoder.
//! However, if you are writing an encoder, this crate will help you write its ops to a file.
4use smdiff_common::{diff_addresses_to_i64, size_routine, write_i_varint, write_u16, write_u8, write_u_varint, AddOp, Copy, CopySrc, Format, Op, SectionHeader, Size, MAX_INST_SIZE, MAX_WIN_SIZE, SECTION_COMPRESSION_RSHIFT, SECTION_CONTINUE_BIT, SECTION_FORMAT_BIT, SIZE_MASK};
5
6
7/// Used to write the header to the section.
8/// * `header` - The header to write.
9/// * `writer` - The writer to write to.
10pub fn write_section_header<W: std::io::Write>(header: &SectionHeader, writer:&mut W) -> std::io::Result<()> {
11    let mut cntl_byte = header.compression_algo << SECTION_COMPRESSION_RSHIFT;
12    if let Format::Segregated = header.format {
13        cntl_byte |= SECTION_FORMAT_BIT;  // Set format bit
14    }
15    if header.more_sections{
16        cntl_byte |= SECTION_CONTINUE_BIT; // Set continuation bit
17    }
18    write_u8(writer, cntl_byte)?;
19    write_u_varint(writer, header.num_operations as u64)?;
20    let output_size = if header.format == Format::Segregated {
21        write_u_varint(writer, header.num_add_bytes as u64)?;
22        header.output_size - header.num_add_bytes
23    } else {
24        header.output_size
25    };
26    write_u_varint(writer, output_size as u64)?;
27    Ok(())
28}
29
30/// Used to write just the ops for the section.
31/// * `ops` - The operations to write.
32/// * `header` - The header for the section. This must match the contents of the ops.
33/// * `writer` - The writer to write to.
34pub fn write_ops<W: std::io::Write,A:AddOp>(ops: &[Op<A>], header:&SectionHeader, writer: &mut W) -> std::io::Result<()> {
35    if header.output_size as usize > MAX_WIN_SIZE{
36        return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "Output size is greater than MAX_WIN_SIZE"));
37    }
38    if ops.len() != header.num_operations as usize {
39        return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "Number of operations does not match header"));
40    }
41    let mut last_d_addr = 0;
42    let mut last_o_addr = 0;
43    let mut total_content_len = 0;
44    let mut add_bytes_written = 0;
45    let mut add_bytes_slices = Vec::new();
46    for op in ops {
47        total_content_len += op.oal() as usize;
48        write_op_byte_and_size(writer, &op)?;
49        match header.format{
50            Format::Interleaved => write_op_addtl(writer, &op, &mut last_d_addr, &mut last_o_addr)?,
51            Format::Segregated => {
52                match op {
53                    Op::Add(a) => {
54                        let slice = a.bytes();
55                        add_bytes_written += slice.len();
56                        add_bytes_slices.push(slice)
57                    },
58                    a => write_op_addtl(writer, a, &mut last_d_addr, &mut last_o_addr)?,
59                }
60            },
61        }
62    }
63    if header.format == Format::Segregated {
64        for slice in add_bytes_slices {
65            writer.write_all(slice)?;
66        }
67        if add_bytes_written != header.num_add_bytes as usize{
68            return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "Number of add bytes does not match header"));
69        }
70    }
71    if total_content_len as usize != header.output_size as usize {
72        return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, format!("Total content length {} does not match output size {}", total_content_len, header.output_size)));
73    }
74    Ok(())
75}
76
77
78fn write_op_byte_and_size<W: std::io::Write,A:AddOp>(writer: &mut W, op: &Op<A>) -> std::io::Result<()> {
79    let byte = op.bit_flag();
80    let size = size_routine(op.oal());
81    assert!(if op.is_run() { op.oal() <= 62 } else { true });
82    match size {
83        Size::Done(len) => write_u8(writer, byte | len)?,
84        Size::U8And62 => {
85            assert!(!op.is_run());
86            write_u8(writer, byte | SIZE_MASK)?;
87            write_u8(writer, (op.oal() - 62) as u8)?
88        },
89        Size::U16 => {
90            assert!(!op.is_run());
91            write_u8(writer, byte)?;
92            write_u16(writer, op.oal())?;
93        }
94    }
95    Ok(())
96}
97
98fn write_op_addtl<W: std::io::Write,A:AddOp>(writer: &mut W, op: &Op<A>, cur_d_addr: &mut u64, cur_o_addr: &mut u64) -> std::io::Result<()> {
99    match op {
100        Op::Run(run) => write_u8(writer, run.byte)?,
101        Op::Add(add) => writer.write_all(&add.bytes())?,
102        Op::Copy (Copy{ src,addr, .. }) => {
103            // Derive difference based on target or source address
104            match src {
105                CopySrc::Dict => {
106                    let int = diff_addresses_to_i64(*cur_d_addr, *addr);
107                    write_i_varint(writer, int)?;
108                    *cur_d_addr = *addr;
109                },
110                CopySrc::Output =>{
111                    let int = diff_addresses_to_i64(*cur_o_addr, *addr);
112                    write_i_varint(writer, int)?;
113                    *cur_o_addr = *addr;
114                },
115            }
116        }
117    }
118    Ok(())
119}
120
121/// This takes a large list of ops and divides them into sections that are no larger than `max_section_size`
122/// * `ops` - The list of ops to divide into sections.
123/// * `max_section_size` - The maximum size of each section in output bytes.
124/// * Returns a vector of tuples containing the ops for each section and the header for that section.
125pub fn make_sections<A:AddOp>(ops: &[Op<A>], max_section_size: usize) -> Vec<(&[Op<A>],SectionHeader)> {
126    let max_win_size = max_section_size.clamp(MAX_INST_SIZE, MAX_WIN_SIZE) as u32;
127    let mut result = Vec::new();
128    let mut output_size = 0;
129    let mut num_add_bytes = 0;
130    let mut start_index = 0;
131
132    for (end_index, op) in ops.iter().enumerate() {
133        // Check if adding the current op exceeds the window size
134        let op_size = op.oal() as u32;
135        if output_size + op_size > max_win_size {
136            result.push((&ops[start_index..end_index],SectionHeader{ num_operations: (end_index-start_index) as u32, num_add_bytes, output_size, compression_algo: 0, format: Format::Interleaved, more_sections: true }));
137            start_index = end_index;
138            output_size = 0;
139            num_add_bytes = 0;
140        }
141        if op.is_add() {
142            num_add_bytes += op_size;
143        }
144        output_size += op_size;
145    }
146
147    // Add the last group
148    result.push((&ops[start_index..],SectionHeader{ num_operations: (ops.len()-start_index) as u32, num_add_bytes, output_size, compression_algo: 0, format: Format::Interleaved, more_sections: false }));
149
150    result
151}
152
#[cfg(test)]
mod test_super {
    use super::*;
    use smdiff_common::Run;

    /// Minimal `AddOp` implementation backed by an owned byte vector.
    struct Add{
        bytes: Vec<u8>,
    }
    impl Add{
        fn new(bytes: Vec<u8>) -> Self {
            Self { bytes }
        }
    }
    impl AddOp for Add{
        fn bytes(&self) -> &[u8] {
            &self.bytes
        }
    }

    /// Builds an `Add` op carrying the bytes of `text`.
    fn add(text: &str) -> Op<Add> {
        Op::Add(Add::new(text.as_bytes().to_vec()))
    }

    /// Header for an uncompressed, segregated section holding exactly one op.
    fn single_op_segregated(num_add_bytes: u32, output_size: u32, more_sections: bool) -> SectionHeader {
        SectionHeader {
            compression_algo: 0,
            format: Format::Segregated,
            num_operations: 1,
            num_add_bytes,
            output_size,
            more_sections,
        }
    }

    /// Writes one complete section (header followed by ops) into `out`.
    fn write_section(ops: &[Op<Add>], header: &SectionHeader, out: &mut Vec<u8>) {
        write_section_header(header, out).unwrap();
        write_ops(ops, header, out).unwrap();
    }

    /// Writes each `(op, header)` pair as its own single-op section and
    /// returns all bytes produced.
    fn write_single_op_sections(sections: &[(Op<Add>, SectionHeader)]) -> Vec<u8> {
        let mut out = Vec::new();
        for (op, header) in sections {
            write_section(std::slice::from_ref(op), header, &mut out);
        }
        out
    }

    #[test]
    fn test_basic_add_run() {
        // One interleaved section: Add("he"), Run('l' x 2), Add("o").
        let ops = vec![
            add("he"),
            Op::Run(Run { byte: b'l', len: 2 }),
            add("o"),
        ];
        let header = SectionHeader {
            compression_algo: 0,
            format: Format::Interleaved,
            num_operations: 3,
            num_add_bytes: 3,
            output_size: 5,
            more_sections: false,
        };
        let mut writer = Vec::new();
        write_section(&ops, &header, &mut writer);

        let expected: Vec<u8> = vec![
            0, // 0b0_0_000_000
            3, //num_ops uvarint
            5, //output size uvarint
            130, //ADD, Size 2 0b10_000010
            104, //'h'
            101, //'e'
            194, //RUN, Size 2 0b11_000010
            108, //'l'
            129, //ADD, Size 1 0b10_000001
            111, //'o'
        ];
        assert_eq!(writer, expected);
    }

    #[test]
    fn test_hello_micro() {
        // One interleaved section turning dict "hello" into "Hello! Hello!".
        let ops = vec![
            add("H"),
            Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
            add("! "),
            Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
        ];
        let header = SectionHeader {
            compression_algo: 0,
            format: Format::Interleaved,
            num_operations: 4,
            num_add_bytes: 3,
            output_size: 13,
            more_sections: false,
        };
        let mut writer = Vec::new();
        write_section(&ops, &header, &mut writer);

        let expected: Vec<u8> = vec![
            0, // 0b0_0_000_000
            4, //num_ops uvarint
            13, //output size uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0
        ];
        assert_eq!(writer, expected);
    }

    #[test]
    pub fn test_hello_win(){
        // Same ops as `test_hello_micro`, but each op is written as its own
        // segregated section, so four sections are produced in total. Every
        // section starts with fresh copy addresses.
        let sections: Vec<(Op<Add>, SectionHeader)> = vec![
            (add("H"), single_op_segregated(1, 1, true)),
            (
                Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
                single_op_segregated(0, 4, true),
            ),
            (add("! "), single_op_segregated(2, 2, true)),
            (
                Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
                single_op_segregated(0, 6, false),
            ),
        ];
        let writer = write_single_op_sections(&sections);

        let expected: Vec<u8> = vec![
            192, // 0b1_1_000_000
            1, //Num ops uvarint
            1, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            4, //Output size uvarint diff encoded from add uvarint
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            2, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '

            64, // 0b0_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            6, //Output size uvarint diff encoded from add uvarint
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0
        ];
        assert_eq!(writer, expected);
    }

    #[test]
    pub fn kitchen_sink_transform(){
        // Six single-op segregated sections, in order:
        //   Add("H"), Copy(dict, addr 1, len 4), Add("! "),
        //   Copy(output, addr 0, len 6), Copy(output, addr 6, len 4),
        //   Run('.' x 3).
        // Only the last section clears the continuation bit.
        let sections: Vec<(Op<Add>, SectionHeader)> = vec![
            (add("H"), single_op_segregated(1, 1, true)),
            (
                Op::Copy(Copy { src: CopySrc::Dict, addr: 1, len: 4 }),
                single_op_segregated(0, 4, true),
            ),
            (add("! "), single_op_segregated(2, 2, true)),
            (
                Op::Copy(Copy { src: CopySrc::Output, addr: 0, len: 6 }),
                single_op_segregated(0, 6, true),
            ),
            (
                Op::Copy(Copy { src: CopySrc::Output, addr: 6, len: 4 }),
                single_op_segregated(0, 4, true),
            ),
            (
                Op::Run(Run { byte: b'.', len: 3 }),
                single_op_segregated(0, 3, false),
            ),
        ];
        let writer = write_single_op_sections(&sections);

        let expected: Vec<u8> = vec![
            192, // 0b1_1_000_000
            1, //Num ops uvarint
            1, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            129, //ADD, Size 1 0b10_000001
            72, //'H'

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            4, //Output size uvarint diff encoded from add uvarint
            4, //COPY_D, Size 4 0b00_000100
            2, //addr ivar int +1

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            2, //Num add bytes uvarint
            0, //Output size uvarint diff encoded from add uvarint
            130, //ADD, Size 2 0b10_000010
            33, //'!'
            32, //' '

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            6, //Output size uvarint diff encoded from add uvarint
            70, //COPY_O, Size 6 0b01_000110
            0, //addr ivar int 0

            192, // 0b1_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            4, //Output size uvarint diff encoded from add uvarint
            68, //COPY_O, Size 4 0b01_000100
            12, //addr ivar int +6

            64, // 0b0_1_000_000
            1, //Num ops uvarint
            0, //Num add bytes uvarint
            3, //Output size uvarint diff encoded from add uvarint
            195, //Run, Size 3 0b11_000011
            46, //'.'
        ];
        assert_eq!(writer, expected);
    }
}