gix_diff/blob/
pipeline.rs

1use std::{
2    io::{Read, Write},
3    path::{Path, PathBuf},
4    process::{Command, Stdio},
5};
6
7use bstr::{BStr, ByteSlice};
8use gix_filter::{
9    driver::apply::{Delay, MaybeDelayed},
10    pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
11};
12use gix_object::tree::EntryKind;
13
14use crate::blob::{Driver, Pipeline, ResourceKind};
15
/// Optional worktree directories from which the old and the new side of a change can be read directly,
/// for resources that are possibly located and accessible in a worktree.
#[derive(Debug, Default, Clone)]
pub struct WorktreeRoots {
    /// The directory holding the previous version of resources, like the source of a rewrite, rename or copy.
    pub old_root: Option<PathBuf>,
    /// The directory holding the new version of resources, like the destination of a rewrite, rename or copy.
    pub new_root: Option<PathBuf>,
}
24
25/// Access
26impl WorktreeRoots {
27    /// Return the root path for the given `kind`
28    pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
29        match kind {
30            ResourceKind::OldOrSource => self.old_root.as_deref(),
31            ResourceKind::NewOrDestination => self.new_root.as_deref(),
32        }
33    }
34
35    /// Return `true` if all worktree roots are unset.
36    pub fn is_unset(&self) -> bool {
37        self.new_root.is_none() && self.old_root.is_none()
38    }
39}
40
/// Describes the diffable data that is part of an [Outcome].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Data {
    /// The data to diff was placed into the buffer passed to [`Pipeline::convert_to_diffable()`].
    Buffer {
        /// `true` if the buffer was produced by a [binary to text filter](Driver::binary_to_text_command),
        /// which makes it a derived value.
        ///
        /// Applications should check this flag to avoid mistaking the buffer for the (original) resource content.
        is_derived: bool,
    },
    /// Only the size of the blob at the given revision is known, with no filters applied, because the blob
    /// is either considered binary or exceeds the big-file threshold.
    ///
    /// A resource in this state cannot be diffed.
    Binary {
        /// The size of the object before any filtering, or as it was found on disk.
        ///
        /// Note that this size doesn't always describe the same 'state' of the content: it may be the size
        /// of the blob in git, or the size of the file in the worktree.
        size: u64,
    },
}
65
66/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
67#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
68pub struct Outcome {
69    /// If available, an index into the `drivers` field to access more diff-related information of the driver for items
70    /// at the given path, as previously determined by git-attributes.
71    ///
72    /// Note that drivers are queried even if there is no object available.
73    pub driver_index: Option<usize>,
74    /// The data itself, suitable for diffing, and if the object or worktree item is present at all.
75    pub data: Option<Data>,
76}
77
78/// Options for use in a [`Pipeline`].
79#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
80pub struct Options {
81    /// The amount of bytes that an object has to reach before being treated as binary.
82    /// These objects will not be queried, nor will their data be processed in any way.
83    /// If `0`, no file is ever considered binary due to their size.
84    ///
85    /// Note that for files stored in `git`, what counts is their stored, decompressed size,
86    /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
87    /// them
88    pub large_file_threshold_bytes: u64,
89    /// Capabilities of the file system which affect how we read worktree files.
90    pub fs: gix_fs::Capabilities,
91}
92
/// Determines which version of a resource is produced by the conversion.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Mode {
    /// Always produce the resource as it would be in the work-tree, and
    /// apply binary-to-text filters if present.
    ///
    /// This is typically free for resources in the worktree, while resources in the
    /// object database need filters applied.
    #[default]
    ToWorktreeAndBinaryToText,
    /// Produce the resource as it would be in the work-tree if binary-to-text filters
    /// are present (and apply them), or use the version stored in `git` otherwise.
    ToGitUnlessBinaryToTextIsPresent,
    /// Always produce resources as they are stored in `git`.
    ///
    /// This is usually fastest, even though resources read from the worktree need to be converted.
    ToGit,
}
111
112impl Mode {
113    fn to_worktree(self) -> bool {
114        matches!(
115            self,
116            Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText
117        )
118    }
119
120    fn to_git(self) -> bool {
121        matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit)
122    }
123}
124
/// Types related to [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
pub mod convert_to_diffable {
    use std::collections::TryReserveError;

    use bstr::BString;
    use gix_object::tree::EntryKind;

    /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
    ///
    /// Most variants carry the `rela_path` of the entry that was being converted when the failure occurred.
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        /// The entry's mode was neither a blob, an executable blob, nor a link.
        #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
        InvalidEntryKind { rela_path: BString, actual: EntryKind },
        /// A symlink was to be read from the worktree, but the filesystem capabilities say symlinks are unsupported.
        #[error("Entry at '{rela_path}' is declared as symlink but symlinks are disabled via core.symlinks")]
        SymlinkDisabled { rela_path: BString },
        /// Reading the target of a symbolic link in the worktree failed for a reason other than it being missing.
        #[error("Entry at '{rela_path}' could not be read as symbolic link")]
        ReadLink { rela_path: BString, source: std::io::Error },
        /// Querying metadata of, opening, or reading a worktree file failed for a reason other than it being missing.
        #[error("Entry at '{rela_path}' could not be opened for reading or read from")]
        OpenOrRead { rela_path: BString, source: std::io::Error },
        /// Copying worktree-filter output into a buffer or into the tempfile used for binary-to-text conversion failed.
        #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
        StreamCopy { rela_path: BString, source: std::io::Error },
        /// The binary-to-text command could not be executed at all.
        #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
        RunTextConvFilter {
            rela_path: BString,
            cmd: String,
            source: std::io::Error,
        },
        /// The tempfile that serves as input for the binary-to-text command could not be created.
        #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
        CreateTempfile { rela_path: BString, source: std::io::Error },
        /// The binary-to-text command ran, but exited unsuccessfully; `stderr` holds what it wrote to standard error.
        #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
        TextConvFilterFailed {
            rela_path: BString,
            cmd: String,
            stderr: BString,
        },
        /// The object or its header could not be found in, or read from, the object database.
        #[error(transparent)]
        FindObject(#[from] gix_object::find::existing_object::Error),
        /// Applying the filter pipeline to obtain the worktree version failed.
        #[error(transparent)]
        ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
        /// Applying the filter pipeline to obtain the version stored in `git` failed.
        #[error(transparent)]
        ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
        /// Reserving memory for the output buffer failed.
        #[error("Memory allocation failed")]
        OutOfMemory(#[from] TryReserveError),
    }
}
170
171/// Lifecycle
172impl Pipeline {
173    /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
174    /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
175    /// `options` are used to further configure the way we act..
176    pub fn new(
177        roots: WorktreeRoots,
178        worktree_filter: gix_filter::Pipeline,
179        mut drivers: Vec<super::Driver>,
180        options: Options,
181    ) -> Self {
182        drivers.sort_by(|a, b| a.name.cmp(&b.name));
183        Pipeline {
184            roots,
185            worktree_filter,
186            drivers,
187            options,
188            attrs: {
189                let mut out = gix_filter::attributes::search::Outcome::default();
190                out.initialize_with_selection(&Default::default(), Some("diff"));
191                out
192            },
193            path: Default::default(),
194        }
195    }
196}
197
198/// Access
199impl Pipeline {
200    /// Return all drivers that this instance was initialized with.
201    ///
202    /// They are sorted by [`name`](Driver::name) to support binary searches.
203    pub fn drivers(&self) -> &[super::Driver] {
204        &self.drivers
205    }
206}
207
208/// Conversion
209impl Pipeline {
210    /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
211    /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
212    /// contains information on how to use `out`, or if it's filled at all.
213    ///
214    /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
215    /// a resource in the object database, i.e. has no worktree root available.
216    ///
217    /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
218    /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
219    ///
220    /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
221    ///
222    /// Use `convert` to control what kind of the resource will be produced.
223    ///
224    /// ### About Tempfiles
225    ///
226    /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
227    /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
228    /// exactly as it would be present in the worktree if checked out.
229    ///
230    /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
231    /// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
232    #[allow(clippy::too_many_arguments)]
233    pub fn convert_to_diffable(
234        &mut self,
235        id: &gix_hash::oid,
236        mode: EntryKind,
237        rela_path: &BStr,
238        kind: ResourceKind,
239        attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
240        objects: &dyn gix_object::FindObjectOrHeader,
241        convert: Mode,
242        out: &mut Vec<u8>,
243    ) -> Result<Outcome, convert_to_diffable::Error> {
244        let is_symlink = match mode {
245            EntryKind::Link => true,
246            EntryKind::Blob | EntryKind::BlobExecutable => false,
247            _ => {
248                return Err(convert_to_diffable::Error::InvalidEntryKind {
249                    rela_path: rela_path.to_owned(),
250                    actual: mode,
251                })
252            }
253        };
254
255        out.clear();
256        attributes(rela_path, &mut self.attrs);
257        let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
258        let driver_index = attr
259            .assignment
260            .state
261            .as_bstr()
262            .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
263        let driver = driver_index.map(|idx| &self.drivers[idx]);
264        let mut is_binary = if let Some(driver) = driver {
265            driver
266                .is_binary
267                .map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
268        } else {
269            attr.assignment.state.is_unset().then_some(true)
270        };
271        match self.roots.by_kind(kind) {
272            Some(root) => {
273                self.path.clear();
274                self.path.push(root);
275                self.path.push(gix_path::from_bstr(rela_path));
276                let data = if is_symlink {
277                    if !self.options.fs.symlink {
278                        return Err(convert_to_diffable::Error::SymlinkDisabled {
279                            rela_path: rela_path.to_owned(),
280                        });
281                    }
282                    let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
283                        convert_to_diffable::Error::ReadLink {
284                            rela_path: rela_path.to_owned(),
285                            source: err,
286                        }
287                    })?;
288                    target.map(|target| {
289                        out.extend_from_slice(gix_path::into_bstr(target).as_ref());
290                        Data::Buffer { is_derived: false }
291                    })
292                } else {
293                    let need_size_only = is_binary == Some(true);
294                    let size_in_bytes = (need_size_only
295                        || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
296                        .then(|| {
297                            none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
298                                convert_to_diffable::Error::OpenOrRead {
299                                    rela_path: rela_path.to_owned(),
300                                    source: err,
301                                }
302                            })
303                        })
304                        .transpose()?;
305                    match size_in_bytes {
306                        Some(None) => None, // missing as identified by the size check
307                        Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
308                            Some(Data::Binary { size })
309                        }
310                        _ => {
311                            match driver
312                                .filter(|_| convert.to_worktree())
313                                .and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
314                            {
315                                Some(cmd) => {
316                                    // Avoid letting the driver program fail if it doesn't exist.
317                                    if self.options.large_file_threshold_bytes == 0
318                                        && none_if_missing(std::fs::symlink_metadata(&self.path))
319                                            .map_err(|err| convert_to_diffable::Error::OpenOrRead {
320                                                rela_path: rela_path.to_owned(),
321                                                source: err,
322                                            })?
323                                            .is_none()
324                                    {
325                                        None
326                                    } else {
327                                        run_cmd(rela_path, cmd, out)?;
328                                        Some(Data::Buffer { is_derived: true })
329                                    }
330                                }
331                                None => {
332                                    let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
333                                        convert_to_diffable::Error::OpenOrRead {
334                                            rela_path: rela_path.to_owned(),
335                                            source: err,
336                                        }
337                                    })?;
338
339                                    match file {
340                                        Some(mut file) => {
341                                            if convert.to_git() {
342                                                let res = self.worktree_filter.convert_to_git(
343                                                    file,
344                                                    gix_path::from_bstr(rela_path).as_ref(),
345                                                    attributes,
346                                                    &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
347                                                )?;
348
349                                                match res {
350                                                    ToGitOutcome::Unchanged(mut file) => {
351                                                        file.read_to_end(out).map_err(|err| {
352                                                            convert_to_diffable::Error::OpenOrRead {
353                                                                rela_path: rela_path.to_owned(),
354                                                                source: err,
355                                                            }
356                                                        })?;
357                                                    }
358                                                    ToGitOutcome::Process(mut stream) => {
359                                                        stream.read_to_end(out).map_err(|err| {
360                                                            convert_to_diffable::Error::OpenOrRead {
361                                                                rela_path: rela_path.to_owned(),
362                                                                source: err,
363                                                            }
364                                                        })?;
365                                                    }
366                                                    ToGitOutcome::Buffer(buf) => {
367                                                        out.clear();
368                                                        out.try_reserve(buf.len())?;
369                                                        out.extend_from_slice(buf);
370                                                    }
371                                                }
372                                            } else {
373                                                file.read_to_end(out).map_err(|err| {
374                                                    convert_to_diffable::Error::OpenOrRead {
375                                                        rela_path: rela_path.to_owned(),
376                                                        source: err,
377                                                    }
378                                                })?;
379                                            }
380
381                                            Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
382                                                let size = out.len() as u64;
383                                                out.clear();
384                                                Data::Binary { size }
385                                            } else {
386                                                Data::Buffer { is_derived: false }
387                                            })
388                                        }
389                                        None => None,
390                                    }
391                                }
392                            }
393                        }
394                    }
395                };
396                Ok(Outcome { driver_index, data })
397            }
398            None => {
399                let data = if id.is_null() {
400                    None
401                } else {
402                    let header = objects
403                        .try_header(id)
404                        .map_err(gix_object::find::existing_object::Error::Find)?
405                        .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
406                    if is_binary.is_none()
407                        && self.options.large_file_threshold_bytes > 0
408                        && header.size > self.options.large_file_threshold_bytes
409                    {
410                        is_binary = Some(true);
411                    }
412                    let data = if is_binary == Some(true) {
413                        Data::Binary { size: header.size }
414                    } else {
415                        objects
416                            .try_find(id, out)
417                            .map_err(gix_object::find::existing_object::Error::Find)?
418                            .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
419                        let mut is_derived = false;
420                        if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
421                            && convert == Mode::ToWorktreeAndBinaryToText
422                            || (convert == Mode::ToGitUnlessBinaryToTextIsPresent
423                                && driver.is_some_and(|d| d.binary_to_text_command.is_some()))
424                        {
425                            let res =
426                                self.worktree_filter
427                                    .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
428
429                            let cmd_and_file = driver
430                                .and_then(|d| {
431                                    d.binary_to_text_command.is_some().then(|| {
432                                        gix_tempfile::new(
433                                            std::env::temp_dir(),
434                                            gix_tempfile::ContainingDirectory::Exists,
435                                            gix_tempfile::AutoRemove::Tempfile,
436                                        )
437                                        .and_then(|mut tmp_file| {
438                                            self.path.clear();
439                                            tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
440                                            Ok(tmp_file)
441                                        })
442                                        .map(|tmp_file| {
443                                            (
444                                                d.prepare_binary_to_text_cmd(&self.path)
445                                                    .expect("always get cmd if command is set"),
446                                                tmp_file,
447                                            )
448                                        })
449                                    })
450                                })
451                                .transpose()
452                                .map_err(|err| convert_to_diffable::Error::CreateTempfile {
453                                    source: err,
454                                    rela_path: rela_path.to_owned(),
455                                })?;
456                            match cmd_and_file {
457                                Some((cmd, mut tmp_file)) => {
458                                    match res {
459                                        ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
460                                            tmp_file.write_all(buf)
461                                        }
462                                        ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
463                                            std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
464                                        }
465                                        ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
466                                            unreachable!("we prohibit this")
467                                        }
468                                    }
469                                    .map_err(|err| {
470                                        convert_to_diffable::Error::StreamCopy {
471                                            source: err,
472                                            rela_path: rela_path.to_owned(),
473                                        }
474                                    })?;
475                                    out.clear();
476                                    run_cmd(rela_path, cmd, out)?;
477                                    is_derived = true;
478                                }
479                                None => match res {
480                                    ToWorktreeOutcome::Unchanged(_) => {}
481                                    ToWorktreeOutcome::Buffer(src) => {
482                                        out.clear();
483                                        out.try_reserve(src.len())?;
484                                        out.extend_from_slice(src);
485                                    }
486                                    ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
487                                        std::io::copy(&mut stream, out).map_err(|err| {
488                                            convert_to_diffable::Error::StreamCopy {
489                                                rela_path: rela_path.to_owned(),
490                                                source: err,
491                                            }
492                                        })?;
493                                    }
494                                    ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
495                                        unreachable!("we prohibit this")
496                                    }
497                                },
498                            }
499                        }
500
501                        if driver.is_none_or(|d| d.binary_to_text_command.is_none())
502                            && is_binary.unwrap_or_else(|| is_binary_buf(out))
503                        {
504                            let size = out.len() as u64;
505                            out.clear();
506                            Data::Binary { size }
507                        } else {
508                            Data::Buffer { is_derived }
509                        }
510                    };
511                    Some(data)
512                };
513                Ok(Outcome { driver_index, data })
514            }
515        }
516    }
517}
518
/// Return `true` if `buf` is considered binary, which is the case if a NUL byte occurs
/// within its first 8000 bytes. Bytes past that window are not inspected.
fn is_binary_buf(buf: &[u8]) -> bool {
    const INSPECTED_BYTES: usize = 8000;
    buf.iter().take(INSPECTED_BYTES).any(|&byte| byte == 0)
}
523
/// Turn a 'not found' I/O error into `Ok(None)`, wrap successful values into `Some`,
/// and pass every other error through unchanged.
fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
    res.map(Some).or_else(|err| {
        if err.kind() == std::io::ErrorKind::NotFound {
            Ok(None)
        } else {
            Err(err)
        }
    })
}
531
532fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
533    gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
534    let mut res = cmd
535        .output()
536        .map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
537            rela_path: rela_path.to_owned(),
538            cmd: format!("{cmd:?}"),
539            source: err,
540        })?;
541    if !res.status.success() {
542        return Err(convert_to_diffable::Error::TextConvFilterFailed {
543            rela_path: rela_path.to_owned(),
544            cmd: format!("{cmd:?}"),
545            stderr: res.stderr.into(),
546        });
547    }
548    out.append(&mut res.stdout);
549    Ok(())
550}
551
552impl Driver {
553    /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
554    pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
555        let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
556        let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
557            // TODO: Add support for an actual Context, validate it *can* match Git
558            .with_context(Default::default())
559            .with_shell()
560            .stdin(Stdio::null())
561            .stdout(Stdio::piped())
562            .stderr(Stdio::piped())
563            .arg(path)
564            .into();
565        Some(cmd)
566    }
567}