gix_diff/blob/
unified_diff.rs

1//! Facilities to produce the unified diff format.
2//!
3//! Originally based on <https://github.com/pascalkuthe/imara-diff/pull/14>.
4
/// The number of unchanged lines shown around each change.
///
/// This corresponds to the `-U` option of `git diff` or gnu-diff. Where the context of one change
/// would overlap with the previous or the next change, it is shortened accordingly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ContextSize {
    /// The amount of context lines printed both before and after each change.
    symmetrical: u32,
}

impl Default for ContextSize {
    /// Produce a symmetrical context of three lines.
    fn default() -> Self {
        Self::symmetrical(3)
    }
}

/// Instantiation
impl ContextSize {
    /// Return a context that shows `n` unchanged lines both before and after a changed hunk.
    pub fn symmetrical(n: u32) -> Self {
        Self { symmetrical: n }
    }
}
28
/// Controls where the newline separator is inserted into the generated diff.
#[derive(Clone, Copy, Debug)]
pub enum NewlineSeparator<'a> {
    /// Insert the given separator, like `\n`, after every patch header and after every line.
    /// Choose this if the diffed tokens do not contain newlines themselves.
    AfterHeaderAndLine(&'a str),
    /// Insert the given separator, like `\n`, after every patch header, and after a line only
    /// if that line does not already end with it.
    /// Choose this if the diffed tokens do contain newlines.
    /// Be aware that tokens *with* newlines may diff strangely at the end of files when lines were appended:
    /// the previously last line will look changed merely because its trailing whitespace 'changed'.
    AfterHeaderAndWhenNeeded(&'a str),
}
41
/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff).
///
/// Implementors receive one call per hunk and turn the sequence of hunks into a final value.
pub trait ConsumeHunk {
    /// The value this instance yields once all hunks were consumed.
    type Out;

    /// Receive a single `hunk` in unified diff format, which is meant to be preceded by `header`.
    /// Both already contain all newlines, nothing needs to be added.
    ///
    /// The [`UnifiedDiff`](super::UnifiedDiff) sink wraps its output in an [`std::io::Result`],
    /// and once this method returned an error for the first time, it will not be called anymore.
    ///
    /// The remaining parameters carry the same hunk-related information that is rendered into `header`:
    ///
    /// * `before_hunk_start` - the 1-based first line of this hunk in the old file.
    /// * `before_hunk_len` - the amount of lines of this hunk in the old file.
    /// * `after_hunk_start` - the 1-based first line of this hunk in the new file.
    /// * `after_hunk_len` - the amount of lines of this hunk in the new file.
    fn consume_hunk(
        &mut self,
        before_hunk_start: u32,
        before_hunk_len: u32,
        after_hunk_start: u32,
        after_hunk_len: u32,
        header: &str,
        hunk: &[u8],
    ) -> std::io::Result<()>;

    /// Invoked once after the last hunk was consumed to produce the output.
    fn finish(self) -> Self::Out;
}
70
71pub(super) mod _impl {
72    use std::{hash::Hash, io::ErrorKind, ops::Range};
73
74    use bstr::{ByteSlice, ByteVec};
75    use imara_diff::{intern, Sink};
76    use intern::{InternedInput, Interner, Token};
77
78    use super::{ConsumeHunk, ContextSize, NewlineSeparator};
79
80    const CONTEXT: char = ' ';
81    const ADDITION: char = '+';
82    const REMOVAL: char = '-';
83
84    /// A [`Sink`] that creates a textual diff in the format typically output by git or `gnu-diff` if the `-u` option is used,
85    /// and passes it in full to a consumer.
86    pub struct UnifiedDiff<'a, T, D>
87    where
88        T: Hash + Eq + AsRef<[u8]>,
89        D: ConsumeHunk,
90    {
91        before: &'a [Token],
92        after: &'a [Token],
93        interner: &'a Interner<T>,
94
95        /// The 0-based start position in the 'before' tokens for the accumulated hunk for display in the header.
96        before_hunk_start: u32,
97        /// The size of the accumulated 'before' hunk in lines for display in the header.
98        before_hunk_len: u32,
99        /// The 0-based start position in the 'after' tokens for the accumulated hunk for display in the header.
100        after_hunk_start: u32,
101        /// The size of the accumulated 'after' hunk in lines.
102        after_hunk_len: u32,
103        // An index into `before` and the context line to print next,
104        // or `None` if this value was never computed to be the correct starting point for an accumulated hunk.
105        ctx_pos: Option<u32>,
106
107        /// Symmetrical context before and after the changed hunk.
108        ctx_size: u32,
109        newline: NewlineSeparator<'a>,
110
111        buffer: Vec<u8>,
112        header_buf: String,
113        delegate: D,
114
115        err: Option<std::io::Error>,
116    }
117
118    impl<'a, T, D> UnifiedDiff<'a, T, D>
119    where
120        T: Hash + Eq + AsRef<[u8]>,
121        D: ConsumeHunk,
122    {
123        /// Create a new instance to create unified diff using the lines in `input`,
124        /// which also must be used when running the diff algorithm.
125        /// `context_size` is the amount of lines around each hunk which will be passed
126        ///to `consume_hunk`.
127        ///
128        /// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`.
129        pub fn new(
130            input: &'a InternedInput<T>,
131            consume_hunk: D,
132            newline_separator: NewlineSeparator<'a>,
133            context_size: ContextSize,
134        ) -> Self {
135            Self {
136                interner: &input.interner,
137                before: &input.before,
138                after: &input.after,
139
140                before_hunk_start: 0,
141                before_hunk_len: 0,
142                after_hunk_len: 0,
143                after_hunk_start: 0,
144                ctx_pos: None,
145
146                ctx_size: context_size.symmetrical,
147                newline: newline_separator,
148
149                buffer: Vec::with_capacity(8),
150                header_buf: String::new(),
151                delegate: consume_hunk,
152
153                err: None,
154            }
155        }
156
157        fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
158            for &token in tokens {
159                self.buffer.push_char(prefix);
160                let line = &self.interner[token];
161                self.buffer.push_str(line);
162                match self.newline {
163                    NewlineSeparator::AfterHeaderAndLine(nl) => {
164                        self.buffer.push_str(nl);
165                    }
166                    NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
167                        if !line.as_ref().ends_with_str(nl) {
168                            self.buffer.push_str(nl);
169                        }
170                    }
171                }
172            }
173        }
174
175        fn flush_accumulated_hunk(&mut self) -> std::io::Result<()> {
176            if self.nothing_to_flush() {
177                return Ok(());
178            }
179
180            let ctx_pos = self.ctx_pos.expect("has been set if we started a hunk");
181            let end = (ctx_pos + self.ctx_size).min(self.before.len() as u32);
182            self.print_context_and_update_pos(ctx_pos..end, end);
183
184            let hunk_start = self.before_hunk_start + 1;
185            let hunk_end = self.after_hunk_start + 1;
186            self.header_buf.clear();
187            std::fmt::Write::write_fmt(
188                &mut self.header_buf,
189                format_args!(
190                    "@@ -{},{} +{},{} @@{nl}",
191                    hunk_start,
192                    self.before_hunk_len,
193                    hunk_end,
194                    self.after_hunk_len,
195                    nl = match self.newline {
196                        NewlineSeparator::AfterHeaderAndLine(nl) | NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
197                            nl
198                        }
199                    }
200                ),
201            )
202            .map_err(|err| std::io::Error::new(ErrorKind::Other, err))?;
203            self.delegate.consume_hunk(
204                hunk_start,
205                self.before_hunk_len,
206                hunk_end,
207                self.after_hunk_len,
208                &self.header_buf,
209                &self.buffer,
210            )?;
211
212            self.reset_hunks();
213            Ok(())
214        }
215
216        fn print_context_and_update_pos(&mut self, print: Range<u32>, move_to: u32) {
217            self.print_tokens(&self.before[print.start as usize..print.end as usize], CONTEXT);
218            let len = print.end - print.start;
219            self.ctx_pos = Some(move_to);
220            self.before_hunk_len += len;
221            self.after_hunk_len += len;
222        }
223
224        fn reset_hunks(&mut self) {
225            self.buffer.clear();
226            self.before_hunk_len = 0;
227            self.after_hunk_len = 0;
228        }
229
230        fn nothing_to_flush(&self) -> bool {
231            self.before_hunk_len == 0 && self.after_hunk_len == 0
232        }
233    }
234
235    impl<T, D> Sink for UnifiedDiff<'_, T, D>
236    where
237        T: Hash + Eq + AsRef<[u8]>,
238        D: ConsumeHunk,
239    {
240        type Out = std::io::Result<D::Out>;
241
242        fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
243            if self.err.is_some() {
244                return;
245            }
246            let start_next_hunk = self
247                .ctx_pos
248                .is_some_and(|ctx_pos| before.start - ctx_pos > 2 * self.ctx_size);
249            if start_next_hunk {
250                if let Err(err) = self.flush_accumulated_hunk() {
251                    self.err = Some(err);
252                    return;
253                }
254                let ctx_pos = before.start - self.ctx_size;
255                self.ctx_pos = Some(ctx_pos);
256                self.before_hunk_start = ctx_pos;
257                self.after_hunk_start = after.start - self.ctx_size;
258            }
259            let ctx_pos = match self.ctx_pos {
260                None => {
261                    // TODO: can this be made so the code above does the job?
262                    let ctx_pos = before.start.saturating_sub(self.ctx_size);
263                    self.before_hunk_start = ctx_pos;
264                    self.after_hunk_start = after.start.saturating_sub(self.ctx_size);
265                    ctx_pos
266                }
267                Some(pos) => pos,
268            };
269            self.print_context_and_update_pos(ctx_pos..before.start, before.end);
270            self.before_hunk_len += before.end - before.start;
271            self.after_hunk_len += after.end - after.start;
272
273            self.print_tokens(&self.before[before.start as usize..before.end as usize], REMOVAL);
274            self.print_tokens(&self.after[after.start as usize..after.end as usize], ADDITION);
275        }
276
277        fn finish(mut self) -> Self::Out {
278            if let Err(err) = self.flush_accumulated_hunk() {
279                self.err = Some(err);
280            }
281            if let Some(err) = self.err {
282                return Err(err);
283            }
284            Ok(self.delegate.finish())
285        }
286    }
287
288    /// An implementation that fails if the input isn't UTF-8.
289    impl ConsumeHunk for String {
290        type Out = Self;
291
292        fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
293            self.push_str(header);
294            self.push_str(
295                hunk.to_str()
296                    .map_err(|err| std::io::Error::new(ErrorKind::Other, err))?,
297            );
298            Ok(())
299        }
300
301        fn finish(self) -> Self::Out {
302            self
303        }
304    }
305
306    /// An implementation that writes hunks into a byte buffer.
307    impl ConsumeHunk for Vec<u8> {
308        type Out = Self;
309
310        fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
311            self.push_str(header);
312            self.push_str(hunk);
313            Ok(())
314        }
315
316        fn finish(self) -> Self::Out {
317            self
318        }
319    }
320}