gix_diff/blob/pipeline.rs
1use std::{
2 io::{Read, Write},
3 path::{Path, PathBuf},
4 process::{Command, Stdio},
5};
6
7use bstr::{BStr, ByteSlice};
8use gix_filter::{
9 driver::apply::{Delay, MaybeDelayed},
10 pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
11};
12use gix_object::tree::EntryKind;
13
14use crate::blob::{Driver, Pipeline, ResourceKind};
15
/// The worktree directories in which resources may be found, one for each side of a diff.
///
/// Each root is optional, as resources are only possibly located and accessible in a worktree.
#[derive(Debug, Default, Clone)]
pub struct WorktreeRoots {
    /// The directory holding the previous version of resources, for example the source of a rewrite, rename or copy.
    pub old_root: Option<PathBuf>,
    /// The directory holding the new version of resources, for example the destination of a rewrite, rename or copy.
    pub new_root: Option<PathBuf>,
}
24
25/// Access
26impl WorktreeRoots {
27 /// Return the root path for the given `kind`
28 pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
29 match kind {
30 ResourceKind::OldOrSource => self.old_root.as_deref(),
31 ResourceKind::NewOrDestination => self.new_root.as_deref(),
32 }
33 }
34
35 /// Return `true` if all worktree roots are unset.
36 pub fn is_unset(&self) -> bool {
37 self.new_root.is_none() && self.old_root.is_none()
38 }
39}
40
/// The diffable representation of a resource, as part of an [Outcome].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Data {
    /// The data to diff is available in the buffer that was passed when calling [`Pipeline::convert_to_diffable()`].
    Buffer {
        /// `true` if the buffer was obtained by running a [binary to text filter](Driver::binary_to_text_command),
        /// which makes it a derived value.
        ///
        /// Applications should check this flag so they don't mistake the buffer for the (original) resource content.
        is_derived: bool,
    },
    /// Only the size of the binary blob at the given revision is known, without filters having been applied,
    /// as the resource is either considered binary or is above the big-file threshold.
    ///
    /// A resource in this state cannot be diffed.
    Binary {
        /// The size of the object before any filtering took place, or the size as it was found on disk.
        ///
        /// Technically, this size doesn't always describe the same 'state' of the content: sometimes it is
        /// the size of the blob in git, and sometimes it is the size of the file in the worktree.
        size: u64,
    },
}
65
66/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
67#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
68pub struct Outcome {
69 /// If available, an index into the `drivers` field to access more diff-related information of the driver for items
70 /// at the given path, as previously determined by git-attributes.
71 ///
72 /// Note that drivers are queried even if there is no object available.
73 pub driver_index: Option<usize>,
74 /// The data itself, suitable for diffing, and if the object or worktree item is present at all.
75 pub data: Option<Data>,
76}
77
78/// Options for use in a [`Pipeline`].
79#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
80pub struct Options {
81 /// The amount of bytes that an object has to reach before being treated as binary.
82 /// These objects will not be queried, nor will their data be processed in any way.
83 /// If `0`, no file is ever considered binary due to their size.
84 ///
85 /// Note that for files stored in `git`, what counts is their stored, decompressed size,
86 /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
87 /// them
88 pub large_file_threshold_bytes: u64,
89 /// Capabilities of the file system which affect how we read worktree files.
90 pub fs: gix_fs::Capabilities,
91}
92
/// The specific way to convert a resource.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Mode {
    /// Always produce the resource as it would look in the work-tree, and
    /// apply binary-to-text filters if they are present.
    ///
    /// This is typically free for resources in the worktree, and will apply filters to resources in the
    /// object database.
    #[default]
    ToWorktreeAndBinaryToText,
    /// Produce the resource as it would look in the work-tree only if binary-to-text filters
    /// are present (and apply them), or use the version stored in `git` otherwise.
    ToGitUnlessBinaryToTextIsPresent,
    /// Always produce resources as they are stored in `git`.
    ///
    /// This is usually fastest, even though resources in the worktree need to be converted.
    ToGit,
}

impl Mode {
    /// Return `true` for every mode that may produce the worktree version of a resource,
    /// i.e. all modes but [`Mode::ToGit`].
    fn to_worktree(self) -> bool {
        match self {
            Mode::ToWorktreeAndBinaryToText | Mode::ToGitUnlessBinaryToTextIsPresent => true,
            Mode::ToGit => false,
        }
    }

    /// Return `true` for every mode that may produce the version of a resource as stored in `git`,
    /// i.e. all modes but [`Mode::ToWorktreeAndBinaryToText`].
    fn to_git(self) -> bool {
        match self {
            Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit => true,
            Mode::ToWorktreeAndBinaryToText => false,
        }
    }
}
124
125///
126pub mod convert_to_diffable {
127 use std::collections::TryReserveError;
128
129 use bstr::BString;
130 use gix_object::tree::EntryKind;
131
132 /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
133 #[derive(Debug, thiserror::Error)]
134 #[allow(missing_docs)]
135 pub enum Error {
136 #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
137 InvalidEntryKind { rela_path: BString, actual: EntryKind },
138 #[error("Entry at '{rela_path}' is declared as symlink but symlinks are disabled via core.symlinks")]
139 SymlinkDisabled { rela_path: BString },
140 #[error("Entry at '{rela_path}' could not be read as symbolic link")]
141 ReadLink { rela_path: BString, source: std::io::Error },
142 #[error("Entry at '{rela_path}' could not be opened for reading or read from")]
143 OpenOrRead { rela_path: BString, source: std::io::Error },
144 #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
145 StreamCopy { rela_path: BString, source: std::io::Error },
146 #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
147 RunTextConvFilter {
148 rela_path: BString,
149 cmd: String,
150 source: std::io::Error,
151 },
152 #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
153 CreateTempfile { rela_path: BString, source: std::io::Error },
154 #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
155 TextConvFilterFailed {
156 rela_path: BString,
157 cmd: String,
158 stderr: BString,
159 },
160 #[error(transparent)]
161 FindObject(#[from] gix_object::find::existing_object::Error),
162 #[error(transparent)]
163 ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
164 #[error(transparent)]
165 ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
166 #[error("Memory allocation failed")]
167 OutOfMemory(#[from] TryReserveError),
168 }
169}
170
171/// Lifecycle
172impl Pipeline {
173 /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
174 /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
175 /// `options` are used to further configure the way we act..
176 pub fn new(
177 roots: WorktreeRoots,
178 worktree_filter: gix_filter::Pipeline,
179 mut drivers: Vec<super::Driver>,
180 options: Options,
181 ) -> Self {
182 drivers.sort_by(|a, b| a.name.cmp(&b.name));
183 Pipeline {
184 roots,
185 worktree_filter,
186 drivers,
187 options,
188 attrs: {
189 let mut out = gix_filter::attributes::search::Outcome::default();
190 out.initialize_with_selection(&Default::default(), Some("diff"));
191 out
192 },
193 path: Default::default(),
194 }
195 }
196}
197
198/// Access
199impl Pipeline {
200 /// Return all drivers that this instance was initialized with.
201 ///
202 /// They are sorted by [`name`](Driver::name) to support binary searches.
203 pub fn drivers(&self) -> &[super::Driver] {
204 &self.drivers
205 }
206}
207
208/// Conversion
209impl Pipeline {
210 /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
211 /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
212 /// contains information on how to use `out`, or if it's filled at all.
213 ///
214 /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
215 /// a resource in the object database, i.e. has no worktree root available.
216 ///
217 /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
218 /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
219 ///
220 /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
221 ///
222 /// Use `convert` to control what kind of the resource will be produced.
223 ///
224 /// ### About Tempfiles
225 ///
226 /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
227 /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
228 /// exactly as it would be present in the worktree if checked out.
229 ///
230 /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
231 /// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
232 #[allow(clippy::too_many_arguments)]
233 pub fn convert_to_diffable(
234 &mut self,
235 id: &gix_hash::oid,
236 mode: EntryKind,
237 rela_path: &BStr,
238 kind: ResourceKind,
239 attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
240 objects: &dyn gix_object::FindObjectOrHeader,
241 convert: Mode,
242 out: &mut Vec<u8>,
243 ) -> Result<Outcome, convert_to_diffable::Error> {
244 let is_symlink = match mode {
245 EntryKind::Link => true,
246 EntryKind::Blob | EntryKind::BlobExecutable => false,
247 _ => {
248 return Err(convert_to_diffable::Error::InvalidEntryKind {
249 rela_path: rela_path.to_owned(),
250 actual: mode,
251 })
252 }
253 };
254
255 out.clear();
256 attributes(rela_path, &mut self.attrs);
257 let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
258 let driver_index = attr
259 .assignment
260 .state
261 .as_bstr()
262 .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
263 let driver = driver_index.map(|idx| &self.drivers[idx]);
264 let mut is_binary = if let Some(driver) = driver {
265 driver
266 .is_binary
267 .map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
268 } else {
269 attr.assignment.state.is_unset().then_some(true)
270 };
271 match self.roots.by_kind(kind) {
272 Some(root) => {
273 self.path.clear();
274 self.path.push(root);
275 self.path.push(gix_path::from_bstr(rela_path));
276 let data = if is_symlink {
277 if !self.options.fs.symlink {
278 return Err(convert_to_diffable::Error::SymlinkDisabled {
279 rela_path: rela_path.to_owned(),
280 });
281 }
282 let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
283 convert_to_diffable::Error::ReadLink {
284 rela_path: rela_path.to_owned(),
285 source: err,
286 }
287 })?;
288 target.map(|target| {
289 out.extend_from_slice(gix_path::into_bstr(target).as_ref());
290 Data::Buffer { is_derived: false }
291 })
292 } else {
293 let need_size_only = is_binary == Some(true);
294 let size_in_bytes = (need_size_only
295 || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
296 .then(|| {
297 none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
298 convert_to_diffable::Error::OpenOrRead {
299 rela_path: rela_path.to_owned(),
300 source: err,
301 }
302 })
303 })
304 .transpose()?;
305 match size_in_bytes {
306 Some(None) => None, // missing as identified by the size check
307 Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
308 Some(Data::Binary { size })
309 }
310 _ => {
311 match driver
312 .filter(|_| convert.to_worktree())
313 .and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
314 {
315 Some(cmd) => {
316 // Avoid letting the driver program fail if it doesn't exist.
317 if self.options.large_file_threshold_bytes == 0
318 && none_if_missing(std::fs::symlink_metadata(&self.path))
319 .map_err(|err| convert_to_diffable::Error::OpenOrRead {
320 rela_path: rela_path.to_owned(),
321 source: err,
322 })?
323 .is_none()
324 {
325 None
326 } else {
327 run_cmd(rela_path, cmd, out)?;
328 Some(Data::Buffer { is_derived: true })
329 }
330 }
331 None => {
332 let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
333 convert_to_diffable::Error::OpenOrRead {
334 rela_path: rela_path.to_owned(),
335 source: err,
336 }
337 })?;
338
339 match file {
340 Some(mut file) => {
341 if convert.to_git() {
342 let res = self.worktree_filter.convert_to_git(
343 file,
344 gix_path::from_bstr(rela_path).as_ref(),
345 attributes,
346 &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
347 )?;
348
349 match res {
350 ToGitOutcome::Unchanged(mut file) => {
351 file.read_to_end(out).map_err(|err| {
352 convert_to_diffable::Error::OpenOrRead {
353 rela_path: rela_path.to_owned(),
354 source: err,
355 }
356 })?;
357 }
358 ToGitOutcome::Process(mut stream) => {
359 stream.read_to_end(out).map_err(|err| {
360 convert_to_diffable::Error::OpenOrRead {
361 rela_path: rela_path.to_owned(),
362 source: err,
363 }
364 })?;
365 }
366 ToGitOutcome::Buffer(buf) => {
367 out.clear();
368 out.try_reserve(buf.len())?;
369 out.extend_from_slice(buf);
370 }
371 }
372 } else {
373 file.read_to_end(out).map_err(|err| {
374 convert_to_diffable::Error::OpenOrRead {
375 rela_path: rela_path.to_owned(),
376 source: err,
377 }
378 })?;
379 }
380
381 Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
382 let size = out.len() as u64;
383 out.clear();
384 Data::Binary { size }
385 } else {
386 Data::Buffer { is_derived: false }
387 })
388 }
389 None => None,
390 }
391 }
392 }
393 }
394 }
395 };
396 Ok(Outcome { driver_index, data })
397 }
398 None => {
399 let data = if id.is_null() {
400 None
401 } else {
402 let header = objects
403 .try_header(id)
404 .map_err(gix_object::find::existing_object::Error::Find)?
405 .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
406 if is_binary.is_none()
407 && self.options.large_file_threshold_bytes > 0
408 && header.size > self.options.large_file_threshold_bytes
409 {
410 is_binary = Some(true);
411 }
412 let data = if is_binary == Some(true) {
413 Data::Binary { size: header.size }
414 } else {
415 objects
416 .try_find(id, out)
417 .map_err(gix_object::find::existing_object::Error::Find)?
418 .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
419 let mut is_derived = false;
420 if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
421 && convert == Mode::ToWorktreeAndBinaryToText
422 || (convert == Mode::ToGitUnlessBinaryToTextIsPresent
423 && driver.is_some_and(|d| d.binary_to_text_command.is_some()))
424 {
425 let res =
426 self.worktree_filter
427 .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
428
429 let cmd_and_file = driver
430 .and_then(|d| {
431 d.binary_to_text_command.is_some().then(|| {
432 gix_tempfile::new(
433 std::env::temp_dir(),
434 gix_tempfile::ContainingDirectory::Exists,
435 gix_tempfile::AutoRemove::Tempfile,
436 )
437 .and_then(|mut tmp_file| {
438 self.path.clear();
439 tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
440 Ok(tmp_file)
441 })
442 .map(|tmp_file| {
443 (
444 d.prepare_binary_to_text_cmd(&self.path)
445 .expect("always get cmd if command is set"),
446 tmp_file,
447 )
448 })
449 })
450 })
451 .transpose()
452 .map_err(|err| convert_to_diffable::Error::CreateTempfile {
453 source: err,
454 rela_path: rela_path.to_owned(),
455 })?;
456 match cmd_and_file {
457 Some((cmd, mut tmp_file)) => {
458 match res {
459 ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
460 tmp_file.write_all(buf)
461 }
462 ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
463 std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
464 }
465 ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
466 unreachable!("we prohibit this")
467 }
468 }
469 .map_err(|err| {
470 convert_to_diffable::Error::StreamCopy {
471 source: err,
472 rela_path: rela_path.to_owned(),
473 }
474 })?;
475 out.clear();
476 run_cmd(rela_path, cmd, out)?;
477 is_derived = true;
478 }
479 None => match res {
480 ToWorktreeOutcome::Unchanged(_) => {}
481 ToWorktreeOutcome::Buffer(src) => {
482 out.clear();
483 out.try_reserve(src.len())?;
484 out.extend_from_slice(src);
485 }
486 ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
487 std::io::copy(&mut stream, out).map_err(|err| {
488 convert_to_diffable::Error::StreamCopy {
489 rela_path: rela_path.to_owned(),
490 source: err,
491 }
492 })?;
493 }
494 ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
495 unreachable!("we prohibit this")
496 }
497 },
498 }
499 }
500
501 if driver.is_none_or(|d| d.binary_to_text_command.is_none())
502 && is_binary.unwrap_or_else(|| is_binary_buf(out))
503 {
504 let size = out.len() as u64;
505 out.clear();
506 Data::Binary { size }
507 } else {
508 Data::Buffer { is_derived }
509 }
510 };
511 Some(data)
512 };
513 Ok(Outcome { driver_index, data })
514 }
515 }
516 }
517}
518
/// Return `true` if `buf` is considered binary, which is the case if a NUL byte
/// is found within its first 8000 bytes.
fn is_binary_buf(buf: &[u8]) -> bool {
    const PROBE_LEN: usize = 8000;
    // Shorter buffers are inspected entirely.
    let head = buf.get(..PROBE_LEN).unwrap_or(buf);
    head.contains(&0)
}
523
/// Turn a `NotFound` error in `res` into `Ok(None)` and wrap a successful value into `Some`.
/// All other errors are passed through unchanged.
fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
    res.map(Some).or_else(|err| {
        if err.kind() == std::io::ErrorKind::NotFound {
            Ok(None)
        } else {
            Err(err)
        }
    })
}
531
532fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
533 gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
534 let mut res = cmd
535 .output()
536 .map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
537 rela_path: rela_path.to_owned(),
538 cmd: format!("{cmd:?}"),
539 source: err,
540 })?;
541 if !res.status.success() {
542 return Err(convert_to_diffable::Error::TextConvFilterFailed {
543 rela_path: rela_path.to_owned(),
544 cmd: format!("{cmd:?}"),
545 stderr: res.stderr.into(),
546 });
547 }
548 out.append(&mut res.stdout);
549 Ok(())
550}
551
552impl Driver {
553 /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
554 pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
555 let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
556 let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
557 // TODO: Add support for an actual Context, validate it *can* match Git
558 .with_context(Default::default())
559 .with_shell()
560 .stdin(Stdio::null())
561 .stdout(Stdio::piped())
562 .stderr(Stdio::piped())
563 .arg(path)
564 .into();
565 Some(cmd)
566 }
567}