crates_index_diff/index/diff/
mod.rs

1use crate::{Change, Index};
2use bstr::ByteSlice;
3use gix::prelude::ObjectIdExt;
4use gix::traverse::commit::simple::CommitTimeOrder;
5use std::sync::atomic::AtomicBool;
6
7mod delegate;
8mod github;
9
10use delegate::Delegate;
11
12/// The order we maintain for the produced changes.
13#[derive(Debug, Copy, Clone, Eq, PartialEq)]
14pub enum Order {
15    /// Compare provided trees or commits without applying any other logic, with the order being influenced by
16    /// factors like hashmaps.
17    ///
18    /// The benefit is mode is the optimal performance as only one diff is created.
19    ImplementationDefined,
20    /// If the provided revisions are commits, single step through the history that connects them to maintain
21    /// the order in which changes were submitted to the crates-index for all user-defined changes.
22    ///
23    /// Admin changes are still implementation defined, but typically involve only deletions.
24    ///
25    /// The shortcomings of this approach is that each pair of commits has to be diffed individually, increasing
26    /// the amount of work linearly.
27    AsInCratesIndex,
28}
29
30/// The error returned by methods dealing with obtaining index changes.
31#[derive(Debug, thiserror::Error)]
32#[allow(missing_docs)]
33pub enum Error {
34    #[error("Couldn't update marker reference")]
35    ReferenceEdit(#[from] gix::reference::edit::Error),
36    #[error("Failed to parse rev-spec to determine which revisions to diff")]
37    RevParse(#[from] gix::revision::spec::parse::Error),
38    #[error(transparent)]
39    DiffRewrites(#[from] gix::diff::new_rewrites::Error),
40    #[error("Couldn't find blob that showed up when diffing trees")]
41    FindObject(#[from] gix::object::find::existing::Error),
42    #[error("Couldn't get the tree of a commit for diffing purposes")]
43    PeelToTree(#[from] gix::object::peel::to_kind::Error),
44    #[error("Failed to diff two trees to find changed crates")]
45    Diff(#[from] gix::diff::options::init::Error),
46    #[error(transparent)]
47    DiffForEach(#[from] gix::object::tree::diff::for_each::Error),
48    #[error("Failed to decode {line:?} in file {file_name:?} as crate version")]
49    VersionDecode {
50        source: serde_json::Error,
51        file_name: bstr::BString,
52        line: bstr::BString,
53    },
54    #[error(transparent)]
55    FindRemote(#[from] gix::remote::find::existing::Error),
56    #[error(transparent)]
57    FindReference(#[from] gix::reference::find::existing::Error),
58    #[error(transparent)]
59    Connect(#[from] gix::remote::connect::Error),
60    #[error(transparent)]
61    PrepareFetch(#[from] gix::remote::fetch::prepare::Error),
62    #[error(transparent)]
63    Fetch(#[from] gix::remote::fetch::Error),
64    #[error(transparent)]
65    InitAnonymousRemote(#[from] gix::remote::init::Error),
66    #[error("Could not find local tracking branch for remote branch {name:?} in any of {} fetched refs", mappings.len()
67    )]
68    NoMatchingBranch {
69        name: String,
70        mappings: Vec<gix::remote::fetch::refmap::Mapping>,
71    },
72    #[error("Error when fetching GitHub fastpath.")]
73    GithubFetch(#[from] reqwest::Error),
74}
75
76/// Find changes without modifying the underling repository
77impl Index {
78    /// As `peek_changes_with_options()`, but without the options.
79    pub fn peek_changes(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
80        self.peek_changes_with_options(
81            gix::progress::Discard,
82            &AtomicBool::default(),
83            Order::ImplementationDefined,
84        )
85    }
86
87    /// As `peek_changes()` but provides changes similar to those in the crates index.
88    pub fn peek_changes_ordered(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
89        self.peek_changes_with_options(
90            gix::progress::Discard,
91            &AtomicBool::default(),
92            Order::AsInCratesIndex,
93        )
94    }
95
96    /// Return all [`Change`]s that are observed between the last time `peek_changes*(…)` was called
97    /// and the latest state of the `crates.io` index repository, which is obtained by fetching
98    /// the remote called `origin` or whatever is configured for the current `HEAD` branch and lastly
99    /// what it should be based on knowledge about he crates index.
100    /// The [`Self::last_seen_reference()`] will not be created or updated.
101    /// The second field in the returned tuple is the commit object to which the changes were provided.
102    /// If one would set the [`Self::last_seen_reference()`] to that object, the effect is exactly the same
103    /// as if [`Self::fetch_changes()`] had been called.
104    ///
105    /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
106    /// these operations to be interrupted gracefully.
107    ///
108    /// # Resource Usage
109    ///
110    /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
111    /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
112    /// of the maximum number of open file handles as configured with `ulimit`.
113    ///
114    /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
115    // TODO: update this once it's clear how auto-gc works in `gitoxide`.
116    pub fn peek_changes_with_options<P>(
117        &self,
118        mut progress: P,
119        should_interrupt: &AtomicBool,
120        order: Order,
121    ) -> Result<(Vec<Change>, gix::hash::ObjectId), Error>
122    where
123        P: gix::NestedProgress,
124        P::SubProgress: 'static,
125    {
126        let repo = &self.repo;
127        let from = repo
128            .find_reference(self.seen_ref_name)
129            .ok()
130            .and_then(|r| r.try_id().map(|id| id.detach()))
131            .unwrap_or_else(|| gix::hash::ObjectId::empty_tree(repo.object_hash()));
132        let to = {
133            let mut remote = self
134                .remote_name
135                .as_deref()
136                .and_then(|name| {
137                    self.repo.find_remote(name.as_bstr()).ok().or_else(|| {
138                        self.repo
139                            .head()
140                            .ok()
141                            .and_then(|head| {
142                                head.into_remote(gix::remote::Direction::Fetch)
143                                    .and_then(|r| r.ok())
144                            })
145                            .or_else(|| {
146                                self.repo
147                                    .find_default_remote(gix::remote::Direction::Fetch)
148                                    .and_then(|r| r.ok())
149                            })
150                    })
151                })
152                .map(Ok)
153                .unwrap_or_else(|| {
154                    self.repo
155                        .head()?
156                        .into_remote(gix::remote::Direction::Fetch)
157                        .map(|r| r.map_err(Error::from))
158                        .or_else(|| {
159                            self.repo
160                                .find_default_remote(gix::remote::Direction::Fetch)
161                                .map(|r| r.map_err(Error::from))
162                        })
163                        .unwrap_or_else(|| {
164                            self.repo
165                                .remote_at("https://github.com/rust-lang/crates.io-index")
166                                .map_err(Into::into)
167                        })
168                })?;
169            if remote.refspecs(gix::remote::Direction::Fetch).is_empty() {
170                let spec = format!(
171                    "+refs/heads/{branch}:refs/remotes/{remote}/{branch}",
172                    remote = self
173                        .remote_name
174                        .as_ref()
175                        .map(|n| n.as_bstr())
176                        .unwrap_or("origin".into()),
177                    branch = self.branch_name,
178                );
179                remote
180                    .replace_refspecs(Some(spec.as_str()), gix::remote::Direction::Fetch)
181                    .expect("valid statically known refspec");
182            }
183
184            let (url, _) = remote.sanitized_url_and_version(gix::remote::Direction::Fetch)?;
185            if matches!(
186                github::has_changes(&url, &from, self.branch_name)?,
187                github::FastPath::UpToDate
188            ) {
189                from.clone()
190            } else {
191                let res: gix::remote::fetch::Outcome = remote
192                    .connect(gix::remote::Direction::Fetch)?
193                    .prepare_fetch(&mut progress, Default::default())?
194                    .receive(&mut progress, should_interrupt)?;
195                let branch_name = format!("refs/heads/{}", self.branch_name);
196                let local_tracking = res
197                    .ref_map
198                    .mappings
199                    .iter()
200                    .find_map(|m| match &m.remote {
201                        gix::remote::fetch::refmap::Source::Ref(r) => (r.unpack().0 == branch_name)
202                            .then_some(m.local.as_ref())
203                            .flatten(),
204                        _ => None,
205                    })
206                    .ok_or_else(|| Error::NoMatchingBranch {
207                        name: branch_name,
208                        mappings: res.ref_map.mappings.clone(),
209                    })?;
210                self.repo
211                    .find_reference(local_tracking)
212                    .expect("local tracking branch exists if we see it here")
213                    .id()
214                    .detach()
215            }
216        };
217
218        Ok((
219            match order {
220                Order::ImplementationDefined => self.changes_between_commits(from, to)?,
221                Order::AsInCratesIndex => self.changes_between_ancestor_commits(from, to)?.0,
222            },
223            to,
224        ))
225    }
226
227    /// Similar to [`Self::changes()`], but requires `from` and `to` objects to be provided. They may point
228    /// to either `Commit`s or `Tree`s.
229    ///
230    /// # Returns
231    ///
232    /// A list of atomic changes that were performed on the index
233    /// between the two revisions.
234    ///
235    /// # Grouping and Ordering
236    ///
237    /// The changes are grouped by the crate they belong to.
238    /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
239    /// The order of crates is **non-deterministic**.
240    ///
241    /// If a specific order is required, the changes must be sorted by the caller.
242    pub fn changes_between_commits(
243        &self,
244        from: impl Into<gix::hash::ObjectId>,
245        to: impl Into<gix::hash::ObjectId>,
246    ) -> Result<Vec<Change>, Error> {
247        let into_tree = |id: gix::hash::ObjectId| -> Result<gix::Tree<'_>, Error> {
248            Ok(id
249                .attach(&self.repo)
250                .object()?
251                .peel_to_kind(gix::object::Kind::Tree)?
252                .into_tree())
253        };
254        let from = into_tree(from.into())?;
255        let to = into_tree(to.into())?;
256        let mut delegate = Delegate::default();
257        from.changes()?
258            .options(|opts| {
259                opts.track_rewrites(None).track_filename();
260            })
261            .for_each_to_obtain_tree(&to, |change| delegate.handle(change))?;
262        delegate.into_result()
263    }
264
265    /// Similar to [`Self::changes()`], but requires `ancestor_commit` and `current_commit` objects to be provided
266    /// with `ancestor_commit` being in the ancestry of `current_commit`.
267    ///
268    /// If the invariants regarding `ancestor_commit` and `current_commit` are not upheld, we fallback
269    /// to `changes_between_commits()` which doesn't have such restrictions.
270    /// This can happen if the crates-index was squashed for instance.
271    ///
272    /// # Returns
273    ///
274    /// A list of atomic changes that were performed on the index
275    /// between the two revisions, but looking at it one commit at a time, along with the `Order`
276    /// that the changes are actually in in case one of the invariants wasn't met.
277    ///
278    /// # Grouping and Ordering
279    ///
280    /// Note that the order of the changes for each crate is **deterministic**, should they happen within one commit,
281    /// as the ordering is imposed to be by line number, ascending.
282    /// Typically one commit does not span multiple crates, but if it does, for instance when rollups happen,
283    /// then the order of crates is also **non-deterministic**.
284    ///
285    pub fn changes_between_ancestor_commits(
286        &self,
287        ancestor_commit: impl Into<gix::hash::ObjectId>,
288        current_commit: impl Into<gix::hash::ObjectId>,
289    ) -> Result<(Vec<Change>, Order), Error> {
290        let from_commit = ancestor_commit.into();
291        let to_commit = current_commit.into();
292        match self.commit_ancestry(from_commit, to_commit) {
293            Some(commits) => {
294                let mut changes = Vec::new();
295                for from_to in commits.windows(2) {
296                    let from = from_to[0];
297                    let to = from_to[1];
298                    changes.extend(self.changes_between_commits(from, to)?);
299                }
300                Ok((changes, Order::AsInCratesIndex))
301            }
302            None => self
303                .changes_between_commits(from_commit, to_commit)
304                .map(|c| (c, Order::ImplementationDefined)),
305        }
306    }
307
308    /// Return a list of commits like `from_commit..=to_commits`.
309    fn commit_ancestry(
310        &self,
311        ancestor_commit: gix::hash::ObjectId,
312        current_commit: gix::hash::ObjectId,
313    ) -> Option<Vec<gix::hash::ObjectId>> {
314        let seconds = ancestor_commit
315            .attach(&self.repo)
316            .object()
317            .ok()?
318            .try_into_commit()
319            .ok()?
320            .committer()
321            .ok()?
322            .seconds();
323        let mut commits = current_commit
324            .attach(&self.repo)
325            .ancestors()
326            .sorting(gix::revision::walk::Sorting::ByCommitTimeCutoff {
327                seconds,
328                order: CommitTimeOrder::NewestFirst,
329            })
330            .first_parent_only()
331            .all()
332            .ok()?
333            .map(|c| c.map(|c| c.id))
334            .collect::<Result<Vec<_>, _>>()
335            .ok()?;
336
337        commits.reverse();
338        if *commits.first()? != ancestor_commit {
339            // try harder, commit resolution is just a second.
340            let pos = commits.iter().position(|c| *c == ancestor_commit)?;
341            commits = commits[pos..].into();
342        }
343        assert_eq!(
344            commits[commits.len() - 1],
345            current_commit,
346            "the iterator includes the tips"
347        );
348        Some(commits)
349    }
350}
351
352/// Find changes while changing the underlying repository in one way or another.
353impl Index {
354    /// As `fetch_changes_with_options()`, but without the options.
355    pub fn fetch_changes(&self) -> Result<Vec<Change>, Error> {
356        self.fetch_changes_with_options(
357            gix::progress::Discard,
358            &AtomicBool::default(),
359            Order::ImplementationDefined,
360        )
361    }
362
363    /// As `fetch_changes()`, but returns an ordered result.
364    pub fn fetch_changes_ordered(&self) -> Result<Vec<Change>, Error> {
365        self.fetch_changes_with_options(
366            gix::progress::Discard,
367            &AtomicBool::default(),
368            Order::AsInCratesIndex,
369        )
370    }
371
372    /// Return all [`Change`]s that are observed between the last time this method was called
373    /// and the latest state of the `crates.io` index repository, which is obtained by fetching
374    /// the remote called `origin`.
375    /// The [`Self::last_seen_reference()`] will be created or adjusted to point to the latest fetched
376    /// state, which causes this method to have a different result each time it is called.
377    ///
378    /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
379    /// these operations to be interrupted gracefully.
380    ///
381    /// `order` configures how changes should be ordered.
382    ///
383    /// # Resource Usage
384    ///
385    /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
386    /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
387    /// of the maximum number of open file handles as configured with `ulimit`.
388    ///
389    /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
390    pub fn fetch_changes_with_options<P>(
391        &self,
392        progress: P,
393        should_interrupt: &AtomicBool,
394        order: Order,
395    ) -> Result<Vec<Change>, Error>
396    where
397        P: gix::NestedProgress,
398        P::SubProgress: 'static,
399    {
400        let (changes, to) = self.peek_changes_with_options(progress, should_interrupt, order)?;
401        self.set_last_seen_reference(to)?;
402        Ok(changes)
403    }
404
405    /// Set the last seen reference to the given Oid. It will be created if it does not yet exists.
406    pub fn set_last_seen_reference(&self, to: gix::hash::ObjectId) -> Result<(), Error> {
407        let repo = self.repository();
408        repo.reference(
409            self.seen_ref_name,
410            to,
411            gix::refs::transaction::PreviousValue::Any,
412            "updating seen-ref head to latest fetched commit",
413        )?;
414        Ok(())
415    }
416
417    /// Return all `CreateVersion`s observed between `from` and `to`. Both parameter are ref-specs
418    /// pointing to either a commit or a tree.
419    /// Learn more about specifying revisions
420    /// in the
421    /// [official documentation](https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html)
422    ///
423    /// # Returns
424    ///
425    /// A list of atomic chanes that were performed on the index
426    /// between the two revisions.
427    ///
428    /// # Grouping and Ordering
429    ///
430    /// The changes are grouped by the crate they belong to.
431    /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
432    /// The order of crates is also **non-deterministic**.
433    ///
434    /// If a specific order is required, the changes must be sorted by the caller.
435    pub fn changes(
436        &self,
437        from: impl AsRef<str>,
438        to: impl AsRef<str>,
439    ) -> Result<Vec<Change>, Error> {
440        let repo = self.repository();
441        let from = repo
442            .rev_parse(from.as_ref())?
443            .single()
444            .expect("revspec was not a range")
445            .detach();
446        let to = repo
447            .rev_parse(to.as_ref())?
448            .single()
449            .expect("revspec was not a range")
450            .detach();
451        self.changes_between_commits(from, to)
452    }
453}