smdiff_merger/
transcoder.rs

1use std::io::{Read, Seek, Write};
2
3use smdiff_common::{CopySrc, MAX_INST_SIZE, MAX_RUN_LEN};
4use smdiff_reader::{Add, Op};
5
6use crate::extract_patch_instructions;
7
8enum InnerOp{
9    Add(Vec<u8>),
10    Copy(smdiff_common::Copy), //we can't do anything with copy operations.
11    Run{byte: u8, len: usize, output_start_pos: usize},
12}
13
14/// Transcodes a patch from one format to another.
15/// This function also attempts to optimize the patch operations while it is transcoding. The routine is as follows:
16/// * It groups adjacent Add operations together, and joins adjacent Run operations.
17/// * It then makes sure that Add operations are no larger than the maximum instruction size.
18/// * The Run operations are also optimized to be no larger than the maximum run length.
19/// * If a Run is long enough, it this fn will encode them using progressively larger Copy operations, until the whole run length is covered.
20///
21///
22/// # Arguments
23/// * `input` - The input patch to transcode.
24/// * `output` - The writer to write the transcoded patch to.
25/// * `format` - The format to transcode the patch to.
26/// * `sec_comp` - The secondary compression to use, if any.
27/// * `output_segment_size` - The size of the output segments.
28///
29/// # Errors
30/// Returns an error if there was an issue reading from the input or writing to the output.
31/// Can also error if there are any invalid operations in the input patch.
32pub fn transcode<R,W>(
33    input: &mut R,
34    output: &mut W,
35    format: smdiff_common::Format,
36    sec_comp: Option<smdiff_encoder::SecondaryCompression>,
37    output_segment_size: usize,
38) -> std::io::Result<()>
39where
40    R: Read+Seek,
41    W: Write,
42{
43    let ops = optimize_and_convert_ops(read_ops_from_patch(input)?);
44    let mut win_data = Vec::new();
45    for (seg_ops, mut header) in crate::make_sections(&ops, output_segment_size) {
46        header.format = format;
47        smdiff_encoder::writer::section_writer(
48            &sec_comp,
49            header,
50            output,
51            &seg_ops,
52            &mut win_data,
53        )?;
54    }
55
56    Ok(())
57}
58
59fn read_ops_from_patch<R:Read+Seek>(input: &mut R) -> std::io::Result<Vec<InnerOp>> {
60    let inners:Vec<InnerOp> = extract_patch_instructions(input)?.0.into_iter().map(|(out_addr,op)|
61        match op {
62            Op::Add(a) => InnerOp::Add(a.bytes.to_vec()),
63            Op::Copy(c) => InnerOp::Copy(c),
64            Op::Run(r) => InnerOp::Run{byte: r.byte, len: r.len as usize, output_start_pos: out_addr as usize},
65        }
66    ).collect();
67    Ok(inners)
68}
69
70fn optimize_and_convert_ops(mut ops: Vec<InnerOp>)->Vec<Op> {
71    join_adjacent_adds(&mut ops);
72    join_adjacent_runs(&mut ops);
73    let mut out_ops = Vec::with_capacity(ops.len());
74    for iop in ops {
75        match iop {
76            InnerOp::Add(bytes) if !bytes.is_empty() => make_add_ops(bytes, &mut out_ops),
77            InnerOp::Copy(copy) => out_ops.push(Op::Copy(copy)),
78            InnerOp::Run{byte, len,output_start_pos} if len>0 => make_run_ops(byte, len, output_start_pos,&mut out_ops),
79            _ => ()
80        }
81    }
82    out_ops
83}
84
85fn join_adjacent_adds(ops: &mut Vec<InnerOp>) {
86    let mut i = 0;
87    while i < ops.len() - 1 {
88        let (left, right) = ops.split_at_mut(i + 1);
89        if let (Some(InnerOp::Add(first)), Some(InnerOp::Add(second))) = (left.last_mut(), right.first_mut()) {
90            if !second.is_empty() && first.len() < MAX_INST_SIZE as usize {
91                first.append(second);
92            }
93            i += 1;
94        }
95        i += 1;
96    }
97}
98
99fn join_adjacent_runs(ops: &mut Vec<InnerOp>) {
100    let mut i = 0;
101    while i < ops.len() - 1 {
102        let (left, right) = ops.split_at_mut(i + 1);
103        if let (Some(InnerOp::Run { byte: byte1, len: len1,.. }), Some(InnerOp::Run { byte: byte2, len: len2, .. })) = (left.last_mut(), right.first_mut()) {
104            if byte1 == byte2 {
105                *len1 += *len2;
106                *len2 = 0;
107            }
108            i += 1;
109        }
110        i += 1;
111    }
112}
113
114pub fn make_add_ops<'a>(bytes: Vec<u8>, output: &mut Vec<Op>){
115    let total_len = bytes.len();
116    if total_len == 0{
117        return;
118    }else if total_len == 1{//emit a run of len 1
119        output.push(Op::Run(smdiff_common::Run{len: 1, byte: bytes[0]}));
120        return;
121    }else if total_len == MAX_INST_SIZE as usize{ //no op if this is already the max size
122        output.push(Op::Add(Add{bytes}));
123        return;
124    }//else we are 2..MAX_INST_SIZE || MAX_INST_SIZE+1..?
125    let mut processed = 0;
126    loop{
127        if processed == total_len{
128            break;
129        }
130        let to_add = total_len - processed;
131        let chunk_size = to_add.min(MAX_INST_SIZE as usize);
132        let op = Add{bytes: bytes[processed..processed+chunk_size].to_vec()};
133        processed += chunk_size;
134        output.push(Op::Add(op));
135    }
136}
137
138// This is directly lifted from the smdiff-encoder crate, so check there for more details/tests
139const RUN_LIMIT: usize = (MAX_RUN_LEN as usize) * 6;
140const COPY_LIMIT: usize = RUN_LIMIT/2;
141fn make_run_ops(byte:u8, len:usize, run_start_pos:usize, output: &mut Vec<Op>){
142    if len < RUN_LIMIT {
143        let mut processed = 0;
144        while processed < len {
145            let remaining = len - processed;
146            let chunk_size = (MAX_RUN_LEN as usize).min(remaining);
147            let op = Op::Run(smdiff_common::Run{byte, len: chunk_size as u8});
148            output.push(op);
149            processed += chunk_size;
150        };
151    }else{
152        //we can use one or more copies on 3 runs.
153        //we need to emit the three runs, then make the copies from the stack
154        output.extend(std::iter::repeat_with(|| Op::Run(smdiff_common::Run{byte, len: MAX_RUN_LEN})).take(3));
155
156        let copy_bytes = len - COPY_LIMIT;
157        let mut processed = 0;
158        let mut max_copy_size = COPY_LIMIT;
159        while processed < copy_bytes{
160            let copy_size = max_copy_size.min(copy_bytes - processed).min(MAX_INST_SIZE);
161            let op = Op::Copy(smdiff_common::Copy{src: CopySrc::Output, addr: run_start_pos as u64, len: copy_size as u16});
162            output.push(op);
163            processed += copy_size;
164            max_copy_size += copy_size;
165        }
166    }
167}
168
169
#[cfg(test)]
mod tests {
    use super::*;
    use smdiff_common::{Copy, CopySrc, Run, Format};
    use smdiff_encoder::TrgtMatcherConfig;
    use smdiff_reader::{Add, Op};
    use std::io::Cursor;

    /// Unoptimized input: two adjacent Adds, two adjacent Runs of the same byte, one Copy.
    fn make_ops() -> Vec<Op> {
        [
            Op::Add(Add{bytes: b"ABC".to_vec()}),
            Op::Add(Add{bytes: b"DE".to_vec()}),
            Op::Run(Run{byte: b'X', len: 3}),
            Op::Run(Run{ byte: b'X', len: 2}),
            Op::Copy(Copy { src: CopySrc::Dict, addr: 0, len: 3 }),
        ].to_vec()
    }

    /// What `make_ops` should look like once the Adds and Runs have been joined.
    fn make_correct_ops() -> Vec<Op> {
        [
            Op::Add(Add{bytes: b"ABCDE".to_vec()}),
            Op::Run(Run{byte: b'X', len: 5}),
            Op::Copy(Copy { src: CopySrc::Dict, addr: 0, len: 3 }),
        ].to_vec()
    }

    /// Encodes `make_ops` as a Segregated patch with smdiff secondary compression
    /// and returns it rewound to the start.
    fn create_test_patch() -> Cursor<Vec<u8>> {
        let mut sink = Cursor::new(Vec::new());
        for (ops, mut header) in crate::make_sections(&make_ops(), smdiff_common::MAX_WIN_SIZE) {
            header.format = Format::Segregated;
            smdiff_encoder::writer::section_writer(&Some(smdiff_encoder::SecondaryCompression::Smdiff(TrgtMatcherConfig::default())), header, &mut sink, ops, &mut Vec::new()).unwrap();
        }
        sink.rewind().unwrap();
        sink
    }

    #[test]
    fn test_optimize_and_convert_ops() {
        let inner_ops = vec![
            InnerOp::Add(b"ABC".to_vec()),
            InnerOp::Add(b"DE".to_vec()),
            InnerOp::Run { byte: b'X', len: 3, output_start_pos: 5 },
            InnerOp::Run { byte: b'X', len: 2, output_start_pos: 8 },
            InnerOp::Copy(Copy { src: CopySrc::Dict, addr: 0, len: 3 }),
        ];

        let optimized_ops = optimize_and_convert_ops(inner_ops);

        assert_eq!(optimized_ops.len(), 3);
        match &optimized_ops[0] {
            Op::Add(add) => assert_eq!(add.bytes, b"ABCDE"),
            _ => panic!("Expected Add op"),
        }
        match &optimized_ops[1] {
            Op::Run(run) => {
                assert_eq!(run.byte, b'X');
                assert_eq!(run.len, 5);
            },
            _ => panic!("Expected Run op"),
        }
        // The Copy must come through the optimizer untouched.
        match &optimized_ops[2] {
            Op::Copy(copy) => {
                assert!(matches!(copy.src, CopySrc::Dict));
                assert_eq!(copy.addr, 0);
                assert_eq!(copy.len, 3);
            },
            _ => panic!("Expected Copy op"),
        }
    }

    #[test]
    fn test_transcode() {
        let mut input = create_test_patch();
        let mut output = Cursor::new(Vec::new());

        transcode( //transcode from Segregated to Interleaved and sec_comp from smdiff to zstd
            &mut input,
            &mut output,
            Format::Interleaved,
            Some(smdiff_encoder::SecondaryCompression::Zstd { level: 3 }),
            smdiff_common::MAX_WIN_SIZE,
        ).unwrap();

        output.rewind().unwrap();
        let transcoded_ops = smdiff_decoder::reader::SectionIterator::new(output)
            .next()
            .unwrap()
            .unwrap()
            .0;

        // The first decoded section must hold exactly the optimized ops.
        assert_eq!(transcoded_ops,make_correct_ops());
    }
}