crates_index_diff/index/diff/
mod.rs

1use crate::{Change, Index};
2use bstr::ByteSlice;
3use gix::prelude::ObjectIdExt;
4use gix::traverse::commit::simple::CommitTimeOrder;
5use std::sync::atomic::AtomicBool;
6
7mod delegate;
8mod github;
9
10use delegate::Delegate;
11
12/// The order we maintain for the produced changes.
13#[derive(Debug, Copy, Clone, Eq, PartialEq)]
14pub enum Order {
15    /// Compare provided trees or commits without applying any other logic, with the order being influenced by
16    /// factors like hashmaps.
17    ///
18    /// The benefit is mode is the optimal performance as only one diff is created.
19    ImplementationDefined,
20    /// If the provided revisions are commits, single step through the history that connects them to maintain
21    /// the order in which changes were submitted to the crates-index for all user-defined changes.
22    ///
23    /// Admin changes are still implementation defined, but typically involve only deletions.
24    ///
25    /// The shortcomings of this approach is that each pair of commits has to be diffed individually, increasing
26    /// the amount of work linearly.
27    AsInCratesIndex,
28}
29
30/// The error returned by methods dealing with obtaining index changes.
31#[derive(Debug, thiserror::Error)]
32#[allow(missing_docs)]
33pub enum Error {
34    #[error("Couldn't update marker reference")]
35    ReferenceEdit(#[from] Box<gix::reference::edit::Error>),
36    #[error("Failed to parse rev-spec to determine which revisions to diff")]
37    RevParse(#[from] Box<gix::revision::spec::parse::Error>),
38    #[error(transparent)]
39    DiffRewrites(#[from] Box<gix::diff::new_rewrites::Error>),
40    #[error("Couldn't find blob that showed up when diffing trees")]
41    FindObject(#[from] Box<gix::object::find::existing::Error>),
42    #[error("Couldn't get the tree of a commit for diffing purposes")]
43    PeelToTree(#[from] Box<gix::object::peel::to_kind::Error>),
44    #[error("Failed to diff two trees to find changed crates")]
45    Diff(#[from] Box<gix::diff::options::init::Error>),
46    #[error(transparent)]
47    DiffForEach(#[from] Box<gix::object::tree::diff::for_each::Error>),
48    #[error("Failed to decode {line:?} in file {file_name:?} as crate version")]
49    VersionDecode {
50        source: serde_json::Error,
51        file_name: bstr::BString,
52        line: bstr::BString,
53    },
54    #[error(transparent)]
55    FindRemote(#[from] Box<gix::remote::find::existing::Error>),
56    #[error(transparent)]
57    FindReference(#[from] Box<gix::reference::find::existing::Error>),
58    #[error(transparent)]
59    Connect(#[from] Box<gix::remote::connect::Error>),
60    #[error(transparent)]
61    PrepareFetch(#[from] Box<gix::remote::fetch::prepare::Error>),
62    #[error(transparent)]
63    Fetch(#[from] Box<gix::remote::fetch::Error>),
64    #[error(transparent)]
65    InitAnonymousRemote(#[from] Box<gix::remote::init::Error>),
66    #[error("Could not find local tracking branch for remote branch {name:?} in any of {} fetched refs", mappings.len()
67    )]
68    NoMatchingBranch {
69        name: String,
70        mappings: Vec<gix::remote::fetch::refmap::Mapping>,
71    },
72    #[error("Error when fetching GitHub fastpath.")]
73    GithubFetch(#[from] reqwest::Error),
74}
75
76impl_from_boxed!(gix::diff::new_rewrites::Error => Error::DiffRewrites);
77impl_from_boxed!(gix::diff::options::init::Error => Error::Diff);
78impl_from_boxed!(gix::object::find::existing::Error => Error::FindObject);
79impl_from_boxed!(gix::object::peel::to_kind::Error => Error::PeelToTree);
80impl_from_boxed!(gix::object::tree::diff::for_each::Error => Error::DiffForEach);
81impl_from_boxed!(gix::reference::edit::Error => Error::ReferenceEdit);
82impl_from_boxed!(gix::reference::find::existing::Error => Error::FindReference);
83impl_from_boxed!(gix::remote::connect::Error => Error::Connect);
84impl_from_boxed!(gix::remote::fetch::Error => Error::Fetch);
85impl_from_boxed!(gix::remote::fetch::prepare::Error => Error::PrepareFetch);
86impl_from_boxed!(gix::remote::find::existing::Error => Error::FindRemote);
87impl_from_boxed!(gix::remote::init::Error => Error::InitAnonymousRemote);
88impl_from_boxed!(gix::revision::spec::parse::Error => Error::RevParse);
89
90/// Find changes without modifying the underling repository
91impl Index {
92    /// As `peek_changes_with_options()`, but without the options.
93    pub fn peek_changes(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
94        self.peek_changes_with_options(
95            gix::progress::Discard,
96            &AtomicBool::default(),
97            Order::ImplementationDefined,
98        )
99    }
100
101    /// As `peek_changes()` but provides changes similar to those in the crates index.
102    pub fn peek_changes_ordered(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
103        self.peek_changes_with_options(
104            gix::progress::Discard,
105            &AtomicBool::default(),
106            Order::AsInCratesIndex,
107        )
108    }
109
110    /// Return all [`Change`]s that are observed between the last time `peek_changes*(…)` was called
111    /// and the latest state of the `crates.io` index repository, which is obtained by fetching
112    /// the remote called `origin` or whatever is configured for the current `HEAD` branch and lastly
113    /// what it should be based on knowledge about he crates index.
114    /// The [`Self::last_seen_reference()`] will not be created or updated.
115    /// The second field in the returned tuple is the commit object to which the changes were provided.
116    /// If one would set the [`Self::last_seen_reference()`] to that object, the effect is exactly the same
117    /// as if [`Self::fetch_changes()`] had been called.
118    ///
119    /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
120    /// these operations to be interrupted gracefully.
121    ///
122    /// # Resource Usage
123    ///
124    /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
125    /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
126    /// of the maximum number of open file handles as configured with `ulimit`.
127    ///
128    /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
129    // TODO: update this once it's clear how auto-gc works in `gitoxide`.
130    pub fn peek_changes_with_options<P>(
131        &self,
132        mut progress: P,
133        should_interrupt: &AtomicBool,
134        order: Order,
135    ) -> Result<(Vec<Change>, gix::hash::ObjectId), Error>
136    where
137        P: gix::NestedProgress,
138        P::SubProgress: 'static,
139    {
140        let repo = &self.repo;
141        let from = repo
142            .find_reference(self.seen_ref_name)
143            .ok()
144            .and_then(|r| r.try_id().map(|id| id.detach()))
145            .unwrap_or_else(|| gix::hash::ObjectId::empty_tree(repo.object_hash()));
146        let to = {
147            let mut remote = self
148                .remote_name
149                .as_deref()
150                .and_then(|name| {
151                    self.repo.find_remote(name.as_bstr()).ok().or_else(|| {
152                        self.repo
153                            .head()
154                            .ok()
155                            .and_then(|head| {
156                                head.into_remote(gix::remote::Direction::Fetch)
157                                    .and_then(|r| r.ok())
158                            })
159                            .or_else(|| {
160                                self.repo
161                                    .find_default_remote(gix::remote::Direction::Fetch)
162                                    .and_then(|r| r.ok())
163                            })
164                    })
165                })
166                .map(Ok)
167                .unwrap_or_else(|| {
168                    self.repo
169                        .head()?
170                        .into_remote(gix::remote::Direction::Fetch)
171                        .map(|r| r.map_err(Error::from))
172                        .or_else(|| {
173                            self.repo
174                                .find_default_remote(gix::remote::Direction::Fetch)
175                                .map(|r| r.map_err(Error::from))
176                        })
177                        .unwrap_or_else(|| {
178                            self.repo
179                                .remote_at("https://github.com/rust-lang/crates.io-index")
180                                .map_err(Into::into)
181                        })
182                })?;
183            if remote.refspecs(gix::remote::Direction::Fetch).is_empty() {
184                let spec = format!(
185                    "+refs/heads/{branch}:refs/remotes/{remote}/{branch}",
186                    remote = self
187                        .remote_name
188                        .as_ref()
189                        .map(|n| n.as_bstr())
190                        .unwrap_or("origin".into()),
191                    branch = self.branch_name,
192                );
193                remote
194                    .replace_refspecs(Some(spec.as_str()), gix::remote::Direction::Fetch)
195                    .expect("valid statically known refspec");
196            }
197
198            let (url, _) = remote.sanitized_url_and_version(gix::remote::Direction::Fetch)?;
199            if matches!(
200                github::has_changes(&url, &from, self.branch_name)?,
201                github::FastPath::UpToDate
202            ) {
203                from
204            } else {
205                let res: gix::remote::fetch::Outcome = remote
206                    .connect(gix::remote::Direction::Fetch)?
207                    .prepare_fetch(&mut progress, Default::default())?
208                    .receive(&mut progress, should_interrupt)?;
209                let branch_name = format!("refs/heads/{}", self.branch_name);
210                let local_tracking = res
211                    .ref_map
212                    .mappings
213                    .iter()
214                    .find_map(|m| match &m.remote {
215                        gix::remote::fetch::refmap::Source::Ref(r) => (r.unpack().0 == branch_name)
216                            .then_some(m.local.as_ref())
217                            .flatten(),
218                        _ => None,
219                    })
220                    .ok_or_else(|| Error::NoMatchingBranch {
221                        name: branch_name,
222                        mappings: res.ref_map.mappings.clone(),
223                    })?;
224                self.repo
225                    .find_reference(local_tracking)
226                    .expect("local tracking branch exists if we see it here")
227                    .id()
228                    .detach()
229            }
230        };
231
232        Ok((
233            match order {
234                Order::ImplementationDefined => self.changes_between_commits(from, to)?,
235                Order::AsInCratesIndex => self.changes_between_ancestor_commits(from, to)?.0,
236            },
237            to,
238        ))
239    }
240
241    /// Similar to [`Self::changes()`], but requires `from` and `to` objects to be provided. They may point
242    /// to either `Commit`s or `Tree`s.
243    ///
244    /// # Returns
245    ///
246    /// A list of atomic changes that were performed on the index
247    /// between the two revisions.
248    ///
249    /// # Grouping and Ordering
250    ///
251    /// The changes are grouped by the crate they belong to.
252    /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
253    /// The order of crates is **non-deterministic**.
254    ///
255    /// If a specific order is required, the changes must be sorted by the caller.
256    pub fn changes_between_commits(
257        &self,
258        from: impl Into<gix::hash::ObjectId>,
259        to: impl Into<gix::hash::ObjectId>,
260    ) -> Result<Vec<Change>, Error> {
261        let into_tree = |id: gix::hash::ObjectId| -> Result<gix::Tree<'_>, Error> {
262            Ok(id
263                .attach(&self.repo)
264                .object()?
265                .peel_to_kind(gix::object::Kind::Tree)?
266                .into_tree())
267        };
268        let from = into_tree(from.into())?;
269        let to = into_tree(to.into())?;
270        let mut delegate = Delegate::default();
271        from.changes()?
272            .options(|opts| {
273                opts.track_rewrites(None).track_filename();
274            })
275            .for_each_to_obtain_tree(&to, |change| delegate.handle(change))?;
276        delegate.into_result()
277    }
278
279    /// Similar to [`Self::changes()`], but requires `ancestor_commit` and `current_commit` objects to be provided
280    /// with `ancestor_commit` being in the ancestry of `current_commit`.
281    ///
282    /// If the invariants regarding `ancestor_commit` and `current_commit` are not upheld, we fallback
283    /// to `changes_between_commits()` which doesn't have such restrictions.
284    /// This can happen if the crates-index was squashed for instance.
285    ///
286    /// # Returns
287    ///
288    /// A list of atomic changes that were performed on the index
289    /// between the two revisions, but looking at it one commit at a time, along with the `Order`
290    /// that the changes are actually in in case one of the invariants wasn't met.
291    ///
292    /// # Grouping and Ordering
293    ///
294    /// Note that the order of the changes for each crate is **deterministic**, should they happen within one commit,
295    /// as the ordering is imposed to be by line number, ascending.
296    /// Typically one commit does not span multiple crates, but if it does, for instance when rollups happen,
297    /// then the order of crates is also **non-deterministic**.
298    ///
299    pub fn changes_between_ancestor_commits(
300        &self,
301        ancestor_commit: impl Into<gix::hash::ObjectId>,
302        current_commit: impl Into<gix::hash::ObjectId>,
303    ) -> Result<(Vec<Change>, Order), Error> {
304        let from_commit = ancestor_commit.into();
305        let to_commit = current_commit.into();
306        match self.commit_ancestry(from_commit, to_commit) {
307            Some(commits) => {
308                let mut changes = Vec::new();
309                for from_to in commits.windows(2) {
310                    let from = from_to[0];
311                    let to = from_to[1];
312                    changes.extend(self.changes_between_commits(from, to)?);
313                }
314                Ok((changes, Order::AsInCratesIndex))
315            }
316            None => self
317                .changes_between_commits(from_commit, to_commit)
318                .map(|c| (c, Order::ImplementationDefined)),
319        }
320    }
321
322    /// Return a list of commits like `from_commit..=to_commits`.
323    fn commit_ancestry(
324        &self,
325        ancestor_commit: gix::hash::ObjectId,
326        current_commit: gix::hash::ObjectId,
327    ) -> Option<Vec<gix::hash::ObjectId>> {
328        let seconds = ancestor_commit
329            .attach(&self.repo)
330            .object()
331            .ok()?
332            .try_into_commit()
333            .ok()?
334            .committer()
335            .ok()?
336            .seconds();
337        let mut commits = current_commit
338            .attach(&self.repo)
339            .ancestors()
340            .sorting(gix::revision::walk::Sorting::ByCommitTimeCutoff {
341                seconds,
342                order: CommitTimeOrder::NewestFirst,
343            })
344            .first_parent_only()
345            .all()
346            .ok()?
347            .map(|c| c.map(|c| c.id))
348            .collect::<Result<Vec<_>, _>>()
349            .ok()?;
350
351        commits.reverse();
352        if *commits.first()? != ancestor_commit {
353            // try harder, commit resolution is just a second.
354            let pos = commits.iter().position(|c| *c == ancestor_commit)?;
355            commits = commits[pos..].into();
356        }
357        assert_eq!(
358            commits[commits.len() - 1],
359            current_commit,
360            "the iterator includes the tips"
361        );
362        Some(commits)
363    }
364}
365
366/// Find changes while changing the underlying repository in one way or another.
367impl Index {
368    /// As `fetch_changes_with_options()`, but without the options.
369    pub fn fetch_changes(&self) -> Result<Vec<Change>, Error> {
370        self.fetch_changes_with_options(
371            gix::progress::Discard,
372            &AtomicBool::default(),
373            Order::ImplementationDefined,
374        )
375    }
376
377    /// As `fetch_changes()`, but returns an ordered result.
378    pub fn fetch_changes_ordered(&self) -> Result<Vec<Change>, Error> {
379        self.fetch_changes_with_options(
380            gix::progress::Discard,
381            &AtomicBool::default(),
382            Order::AsInCratesIndex,
383        )
384    }
385
386    /// Return all [`Change`]s that are observed between the last time this method was called
387    /// and the latest state of the `crates.io` index repository, which is obtained by fetching
388    /// the remote called `origin`.
389    /// The [`Self::last_seen_reference()`] will be created or adjusted to point to the latest fetched
390    /// state, which causes this method to have a different result each time it is called.
391    ///
392    /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
393    /// these operations to be interrupted gracefully.
394    ///
395    /// `order` configures how changes should be ordered.
396    ///
397    /// # Resource Usage
398    ///
399    /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
400    /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
401    /// of the maximum number of open file handles as configured with `ulimit`.
402    ///
403    /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
404    pub fn fetch_changes_with_options<P>(
405        &self,
406        progress: P,
407        should_interrupt: &AtomicBool,
408        order: Order,
409    ) -> Result<Vec<Change>, Error>
410    where
411        P: gix::NestedProgress,
412        P::SubProgress: 'static,
413    {
414        let (changes, to) = self.peek_changes_with_options(progress, should_interrupt, order)?;
415        self.set_last_seen_reference(to)?;
416        Ok(changes)
417    }
418
419    /// Set the last seen reference to the given Oid. It will be created if it does not yet exists.
420    pub fn set_last_seen_reference(&self, to: gix::hash::ObjectId) -> Result<(), Error> {
421        let repo = self.repository();
422        repo.reference(
423            self.seen_ref_name,
424            to,
425            gix::refs::transaction::PreviousValue::Any,
426            "updating seen-ref head to latest fetched commit",
427        )?;
428        Ok(())
429    }
430
431    /// Return all `CreateVersion`s observed between `from` and `to`. Both parameter are ref-specs
432    /// pointing to either a commit or a tree.
433    /// Learn more about specifying revisions
434    /// in the
435    /// [official documentation](https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html)
436    ///
437    /// # Returns
438    ///
439    /// A list of atomic chanes that were performed on the index
440    /// between the two revisions.
441    ///
442    /// # Grouping and Ordering
443    ///
444    /// The changes are grouped by the crate they belong to.
445    /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
446    /// The order of crates is also **non-deterministic**.
447    ///
448    /// If a specific order is required, the changes must be sorted by the caller.
449    pub fn changes(
450        &self,
451        from: impl AsRef<str>,
452        to: impl AsRef<str>,
453    ) -> Result<Vec<Change>, Error> {
454        let repo = self.repository();
455        let from = repo
456            .rev_parse(from.as_ref())?
457            .single()
458            .expect("revspec was not a range")
459            .detach();
460        let to = repo
461            .rev_parse(to.as_ref())?
462            .single()
463            .expect("revspec was not a range")
464            .detach();
465        self.changes_between_commits(from, to)
466    }
467}