crates_index_diff/index/diff/mod.rs
1use crate::{Change, Index};
2use bstr::ByteSlice;
3use gix::prelude::ObjectIdExt;
4use gix::traverse::commit::simple::CommitTimeOrder;
5use std::sync::atomic::AtomicBool;
6
7mod delegate;
8mod github;
9
10use delegate::Delegate;
11
12/// The order we maintain for the produced changes.
13#[derive(Debug, Copy, Clone, Eq, PartialEq)]
14pub enum Order {
15 /// Compare provided trees or commits without applying any other logic, with the order being influenced by
16 /// factors like hashmaps.
17 ///
18 /// The benefit is mode is the optimal performance as only one diff is created.
19 ImplementationDefined,
20 /// If the provided revisions are commits, single step through the history that connects them to maintain
21 /// the order in which changes were submitted to the crates-index for all user-defined changes.
22 ///
23 /// Admin changes are still implementation defined, but typically involve only deletions.
24 ///
25 /// The shortcomings of this approach is that each pair of commits has to be diffed individually, increasing
26 /// the amount of work linearly.
27 AsInCratesIndex,
28}
29
30/// The error returned by methods dealing with obtaining index changes.
31#[derive(Debug, thiserror::Error)]
32#[allow(missing_docs)]
33pub enum Error {
34 #[error("Couldn't update marker reference")]
35 ReferenceEdit(#[from] gix::reference::edit::Error),
36 #[error("Failed to parse rev-spec to determine which revisions to diff")]
37 RevParse(#[from] gix::revision::spec::parse::Error),
38 #[error(transparent)]
39 DiffRewrites(#[from] gix::diff::new_rewrites::Error),
40 #[error("Couldn't find blob that showed up when diffing trees")]
41 FindObject(#[from] gix::object::find::existing::Error),
42 #[error("Couldn't get the tree of a commit for diffing purposes")]
43 PeelToTree(#[from] gix::object::peel::to_kind::Error),
44 #[error("Failed to diff two trees to find changed crates")]
45 Diff(#[from] gix::diff::options::init::Error),
46 #[error(transparent)]
47 DiffForEach(#[from] gix::object::tree::diff::for_each::Error),
48 #[error("Failed to decode {line:?} in file {file_name:?} as crate version")]
49 VersionDecode {
50 source: serde_json::Error,
51 file_name: bstr::BString,
52 line: bstr::BString,
53 },
54 #[error(transparent)]
55 FindRemote(#[from] gix::remote::find::existing::Error),
56 #[error(transparent)]
57 FindReference(#[from] gix::reference::find::existing::Error),
58 #[error(transparent)]
59 Connect(#[from] gix::remote::connect::Error),
60 #[error(transparent)]
61 PrepareFetch(#[from] gix::remote::fetch::prepare::Error),
62 #[error(transparent)]
63 Fetch(#[from] gix::remote::fetch::Error),
64 #[error(transparent)]
65 InitAnonymousRemote(#[from] gix::remote::init::Error),
66 #[error("Could not find local tracking branch for remote branch {name:?} in any of {} fetched refs", mappings.len()
67 )]
68 NoMatchingBranch {
69 name: String,
70 mappings: Vec<gix::remote::fetch::refmap::Mapping>,
71 },
72 #[error("Error when fetching GitHub fastpath.")]
73 GithubFetch(#[from] reqwest::Error),
74}
75
76/// Find changes without modifying the underling repository
77impl Index {
78 /// As `peek_changes_with_options()`, but without the options.
79 pub fn peek_changes(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
80 self.peek_changes_with_options(
81 gix::progress::Discard,
82 &AtomicBool::default(),
83 Order::ImplementationDefined,
84 )
85 }
86
87 /// As `peek_changes()` but provides changes similar to those in the crates index.
88 pub fn peek_changes_ordered(&self) -> Result<(Vec<Change>, gix::hash::ObjectId), Error> {
89 self.peek_changes_with_options(
90 gix::progress::Discard,
91 &AtomicBool::default(),
92 Order::AsInCratesIndex,
93 )
94 }
95
96 /// Return all [`Change`]s that are observed between the last time `peek_changes*(…)` was called
97 /// and the latest state of the `crates.io` index repository, which is obtained by fetching
98 /// the remote called `origin` or whatever is configured for the current `HEAD` branch and lastly
99 /// what it should be based on knowledge about he crates index.
100 /// The [`Self::last_seen_reference()`] will not be created or updated.
101 /// The second field in the returned tuple is the commit object to which the changes were provided.
102 /// If one would set the [`Self::last_seen_reference()`] to that object, the effect is exactly the same
103 /// as if [`Self::fetch_changes()`] had been called.
104 ///
105 /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
106 /// these operations to be interrupted gracefully.
107 ///
108 /// # Resource Usage
109 ///
110 /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
111 /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
112 /// of the maximum number of open file handles as configured with `ulimit`.
113 ///
114 /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
115 // TODO: update this once it's clear how auto-gc works in `gitoxide`.
116 pub fn peek_changes_with_options<P>(
117 &self,
118 mut progress: P,
119 should_interrupt: &AtomicBool,
120 order: Order,
121 ) -> Result<(Vec<Change>, gix::hash::ObjectId), Error>
122 where
123 P: gix::NestedProgress,
124 P::SubProgress: 'static,
125 {
126 let repo = &self.repo;
127 let from = repo
128 .find_reference(self.seen_ref_name)
129 .ok()
130 .and_then(|r| r.try_id().map(|id| id.detach()))
131 .unwrap_or_else(|| gix::hash::ObjectId::empty_tree(repo.object_hash()));
132 let to = {
133 let mut remote = self
134 .remote_name
135 .as_deref()
136 .and_then(|name| {
137 self.repo.find_remote(name.as_bstr()).ok().or_else(|| {
138 self.repo
139 .head()
140 .ok()
141 .and_then(|head| {
142 head.into_remote(gix::remote::Direction::Fetch)
143 .and_then(|r| r.ok())
144 })
145 .or_else(|| {
146 self.repo
147 .find_default_remote(gix::remote::Direction::Fetch)
148 .and_then(|r| r.ok())
149 })
150 })
151 })
152 .map(Ok)
153 .unwrap_or_else(|| {
154 self.repo
155 .head()?
156 .into_remote(gix::remote::Direction::Fetch)
157 .map(|r| r.map_err(Error::from))
158 .or_else(|| {
159 self.repo
160 .find_default_remote(gix::remote::Direction::Fetch)
161 .map(|r| r.map_err(Error::from))
162 })
163 .unwrap_or_else(|| {
164 self.repo
165 .remote_at("https://github.com/rust-lang/crates.io-index")
166 .map_err(Into::into)
167 })
168 })?;
169 if remote.refspecs(gix::remote::Direction::Fetch).is_empty() {
170 let spec = format!(
171 "+refs/heads/{branch}:refs/remotes/{remote}/{branch}",
172 remote = self
173 .remote_name
174 .as_ref()
175 .map(|n| n.as_bstr())
176 .unwrap_or("origin".into()),
177 branch = self.branch_name,
178 );
179 remote
180 .replace_refspecs(Some(spec.as_str()), gix::remote::Direction::Fetch)
181 .expect("valid statically known refspec");
182 }
183
184 let (url, _) = remote.sanitized_url_and_version(gix::remote::Direction::Fetch)?;
185 if matches!(
186 github::has_changes(&url, &from, self.branch_name)?,
187 github::FastPath::UpToDate
188 ) {
189 from.clone()
190 } else {
191 let res: gix::remote::fetch::Outcome = remote
192 .connect(gix::remote::Direction::Fetch)?
193 .prepare_fetch(&mut progress, Default::default())?
194 .receive(&mut progress, should_interrupt)?;
195 let branch_name = format!("refs/heads/{}", self.branch_name);
196 let local_tracking = res
197 .ref_map
198 .mappings
199 .iter()
200 .find_map(|m| match &m.remote {
201 gix::remote::fetch::refmap::Source::Ref(r) => (r.unpack().0 == branch_name)
202 .then_some(m.local.as_ref())
203 .flatten(),
204 _ => None,
205 })
206 .ok_or_else(|| Error::NoMatchingBranch {
207 name: branch_name,
208 mappings: res.ref_map.mappings.clone(),
209 })?;
210 self.repo
211 .find_reference(local_tracking)
212 .expect("local tracking branch exists if we see it here")
213 .id()
214 .detach()
215 }
216 };
217
218 Ok((
219 match order {
220 Order::ImplementationDefined => self.changes_between_commits(from, to)?,
221 Order::AsInCratesIndex => self.changes_between_ancestor_commits(from, to)?.0,
222 },
223 to,
224 ))
225 }
226
227 /// Similar to [`Self::changes()`], but requires `from` and `to` objects to be provided. They may point
228 /// to either `Commit`s or `Tree`s.
229 ///
230 /// # Returns
231 ///
232 /// A list of atomic changes that were performed on the index
233 /// between the two revisions.
234 ///
235 /// # Grouping and Ordering
236 ///
237 /// The changes are grouped by the crate they belong to.
238 /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
239 /// The order of crates is **non-deterministic**.
240 ///
241 /// If a specific order is required, the changes must be sorted by the caller.
242 pub fn changes_between_commits(
243 &self,
244 from: impl Into<gix::hash::ObjectId>,
245 to: impl Into<gix::hash::ObjectId>,
246 ) -> Result<Vec<Change>, Error> {
247 let into_tree = |id: gix::hash::ObjectId| -> Result<gix::Tree<'_>, Error> {
248 Ok(id
249 .attach(&self.repo)
250 .object()?
251 .peel_to_kind(gix::object::Kind::Tree)?
252 .into_tree())
253 };
254 let from = into_tree(from.into())?;
255 let to = into_tree(to.into())?;
256 let mut delegate = Delegate::default();
257 from.changes()?
258 .options(|opts| {
259 opts.track_rewrites(None).track_filename();
260 })
261 .for_each_to_obtain_tree(&to, |change| delegate.handle(change))?;
262 delegate.into_result()
263 }
264
265 /// Similar to [`Self::changes()`], but requires `ancestor_commit` and `current_commit` objects to be provided
266 /// with `ancestor_commit` being in the ancestry of `current_commit`.
267 ///
268 /// If the invariants regarding `ancestor_commit` and `current_commit` are not upheld, we fallback
269 /// to `changes_between_commits()` which doesn't have such restrictions.
270 /// This can happen if the crates-index was squashed for instance.
271 ///
272 /// # Returns
273 ///
274 /// A list of atomic changes that were performed on the index
275 /// between the two revisions, but looking at it one commit at a time, along with the `Order`
276 /// that the changes are actually in in case one of the invariants wasn't met.
277 ///
278 /// # Grouping and Ordering
279 ///
280 /// Note that the order of the changes for each crate is **deterministic**, should they happen within one commit,
281 /// as the ordering is imposed to be by line number, ascending.
282 /// Typically one commit does not span multiple crates, but if it does, for instance when rollups happen,
283 /// then the order of crates is also **non-deterministic**.
284 ///
285 pub fn changes_between_ancestor_commits(
286 &self,
287 ancestor_commit: impl Into<gix::hash::ObjectId>,
288 current_commit: impl Into<gix::hash::ObjectId>,
289 ) -> Result<(Vec<Change>, Order), Error> {
290 let from_commit = ancestor_commit.into();
291 let to_commit = current_commit.into();
292 match self.commit_ancestry(from_commit, to_commit) {
293 Some(commits) => {
294 let mut changes = Vec::new();
295 for from_to in commits.windows(2) {
296 let from = from_to[0];
297 let to = from_to[1];
298 changes.extend(self.changes_between_commits(from, to)?);
299 }
300 Ok((changes, Order::AsInCratesIndex))
301 }
302 None => self
303 .changes_between_commits(from_commit, to_commit)
304 .map(|c| (c, Order::ImplementationDefined)),
305 }
306 }
307
308 /// Return a list of commits like `from_commit..=to_commits`.
309 fn commit_ancestry(
310 &self,
311 ancestor_commit: gix::hash::ObjectId,
312 current_commit: gix::hash::ObjectId,
313 ) -> Option<Vec<gix::hash::ObjectId>> {
314 let seconds = ancestor_commit
315 .attach(&self.repo)
316 .object()
317 .ok()?
318 .try_into_commit()
319 .ok()?
320 .committer()
321 .ok()?
322 .seconds();
323 let mut commits = current_commit
324 .attach(&self.repo)
325 .ancestors()
326 .sorting(gix::revision::walk::Sorting::ByCommitTimeCutoff {
327 seconds,
328 order: CommitTimeOrder::NewestFirst,
329 })
330 .first_parent_only()
331 .all()
332 .ok()?
333 .map(|c| c.map(|c| c.id))
334 .collect::<Result<Vec<_>, _>>()
335 .ok()?;
336
337 commits.reverse();
338 if *commits.first()? != ancestor_commit {
339 // try harder, commit resolution is just a second.
340 let pos = commits.iter().position(|c| *c == ancestor_commit)?;
341 commits = commits[pos..].into();
342 }
343 assert_eq!(
344 commits[commits.len() - 1],
345 current_commit,
346 "the iterator includes the tips"
347 );
348 Some(commits)
349 }
350}
351
352/// Find changes while changing the underlying repository in one way or another.
353impl Index {
354 /// As `fetch_changes_with_options()`, but without the options.
355 pub fn fetch_changes(&self) -> Result<Vec<Change>, Error> {
356 self.fetch_changes_with_options(
357 gix::progress::Discard,
358 &AtomicBool::default(),
359 Order::ImplementationDefined,
360 )
361 }
362
363 /// As `fetch_changes()`, but returns an ordered result.
364 pub fn fetch_changes_ordered(&self) -> Result<Vec<Change>, Error> {
365 self.fetch_changes_with_options(
366 gix::progress::Discard,
367 &AtomicBool::default(),
368 Order::AsInCratesIndex,
369 )
370 }
371
372 /// Return all [`Change`]s that are observed between the last time this method was called
373 /// and the latest state of the `crates.io` index repository, which is obtained by fetching
374 /// the remote called `origin`.
375 /// The [`Self::last_seen_reference()`] will be created or adjusted to point to the latest fetched
376 /// state, which causes this method to have a different result each time it is called.
377 ///
378 /// The `progress` and `should_interrupt` parameters are used to provide progress for fetches and allow
379 /// these operations to be interrupted gracefully.
380 ///
381 /// `order` configures how changes should be ordered.
382 ///
383 /// # Resource Usage
384 ///
385 /// As this method fetches the git repository, loose objects or small packs may be created. Over time,
386 /// these will accumulate and either slow down subsequent operations, or cause them to fail due to exhaustion
387 /// of the maximum number of open file handles as configured with `ulimit`.
388 ///
389 /// Thus it is advised for the caller to run `git gc` occasionally based on their own requirements and usage patterns.
390 pub fn fetch_changes_with_options<P>(
391 &self,
392 progress: P,
393 should_interrupt: &AtomicBool,
394 order: Order,
395 ) -> Result<Vec<Change>, Error>
396 where
397 P: gix::NestedProgress,
398 P::SubProgress: 'static,
399 {
400 let (changes, to) = self.peek_changes_with_options(progress, should_interrupt, order)?;
401 self.set_last_seen_reference(to)?;
402 Ok(changes)
403 }
404
405 /// Set the last seen reference to the given Oid. It will be created if it does not yet exists.
406 pub fn set_last_seen_reference(&self, to: gix::hash::ObjectId) -> Result<(), Error> {
407 let repo = self.repository();
408 repo.reference(
409 self.seen_ref_name,
410 to,
411 gix::refs::transaction::PreviousValue::Any,
412 "updating seen-ref head to latest fetched commit",
413 )?;
414 Ok(())
415 }
416
417 /// Return all `CreateVersion`s observed between `from` and `to`. Both parameter are ref-specs
418 /// pointing to either a commit or a tree.
419 /// Learn more about specifying revisions
420 /// in the
421 /// [official documentation](https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html)
422 ///
423 /// # Returns
424 ///
425 /// A list of atomic chanes that were performed on the index
426 /// between the two revisions.
427 ///
428 /// # Grouping and Ordering
429 ///
430 /// The changes are grouped by the crate they belong to.
431 /// The order of the changes for each crate is **deterministic** as they are ordered by line number, ascending.
432 /// The order of crates is also **non-deterministic**.
433 ///
434 /// If a specific order is required, the changes must be sorted by the caller.
435 pub fn changes(
436 &self,
437 from: impl AsRef<str>,
438 to: impl AsRef<str>,
439 ) -> Result<Vec<Change>, Error> {
440 let repo = self.repository();
441 let from = repo
442 .rev_parse(from.as_ref())?
443 .single()
444 .expect("revspec was not a range")
445 .detach();
446 let to = repo
447 .rev_parse(to.as_ref())?
448 .single()
449 .expect("revspec was not a range")
450 .detach();
451 self.changes_between_commits(from, to)
452 }
453}