1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
use std::{io, marker::PhantomData, path::Path};
use bstr::BStr;
use filetime::FileTime;
use gix_features::parallel::{in_parallel_if, Reduce};
use crate::{
read,
status::{
content,
content::CompareBlobs,
types::{Error, Options},
Change, VisitEntry,
},
};
/// Calculates the changes that need to be applied to an `index` to match the state of the `worktree` and makes them
/// observable in `collector`, along with information produced by `compare` which gets to see blobs that may have changes.
/// `options` are used to configure the operation.
///
/// Note that `index` is updated with the latest seen stat information from the worktree, and its timestamp is adjusted to
/// the current time for which it will be considered fresh.
///
/// Note that this isn't technically quite what this function does as this also provides some additional information,
/// like whether a file has conflicts, and files that were added with `git add` are shown as a special
/// changes despite not technically requiring a change to the index since `git add` already added the file to the index.
pub fn status<'index, T, Find, E>(
index: &'index mut gix_index::State,
worktree: &Path,
collector: &mut impl VisitEntry<'index, ContentChange = T>,
compare: impl CompareBlobs<Output = T> + Send + Clone,
find: Find,
options: Options,
) -> Result<(), Error>
where
T: Send,
E: std::error::Error + Send + Sync + 'static,
Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
{
// the order is absolutely critical here we use the old timestamp to detect racy index entries
// (modified at or after the last index update) during the index update we then set those
// entries size to 0 (see below) to ensure they keep showing up as racy and reset the timestamp.
let timestamp = index.timestamp();
index.set_timestamp(FileTime::now());
let (chunk_size, thread_limit, _) = gix_features::parallel::optimize_chunk_size_and_thread_limit(
100,
index.entries().len().into(),
options.thread_limit,
None,
);
let (entries, path_backing) = index.entries_mut_and_pathbacking();
in_parallel_if(
|| true, // TODO: heuristic: when is parallelization not worth it?
entries.chunks_mut(chunk_size),
thread_limit,
{
let options = &options;
move |_| {
(
State {
buf: Vec::new(),
odb_buf: Vec::new(),
timestamp,
path_backing,
worktree,
options,
},
compare.clone(),
find.clone(),
)
}
},
|entries, (state, diff, find)| {
entries
.iter_mut()
.filter_map(|entry| state.process(entry, diff, find))
.collect()
},
ReduceChange {
collector,
phantom: PhantomData,
},
)
}
struct State<'a, 'b> {
buf: Vec<u8>,
odb_buf: Vec<u8>,
timestamp: FileTime,
// path_cache: fs::Cache TODO path cache
path_backing: &'b [u8],
worktree: &'a Path,
options: &'a Options,
}
type StatusResult<'index, T> = Result<(&'index gix_index::Entry, &'index BStr, Option<Change<T>>, bool), Error>;
impl<'index> State<'_, 'index> {
fn process<T, Find, E>(
&mut self,
entry: &'index mut gix_index::Entry,
diff: &mut impl CompareBlobs<Output = T>,
find: &mut Find,
) -> Option<StatusResult<'index, T>>
where
E: std::error::Error + Send + Sync + 'static,
Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
{
let conflict = match entry.stage() {
0 => false,
1 => true,
_ => return None,
};
if entry.flags.intersects(
gix_index::entry::Flags::UPTODATE
| gix_index::entry::Flags::SKIP_WORKTREE
| gix_index::entry::Flags::ASSUME_VALID
| gix_index::entry::Flags::FSMONITOR_VALID,
) {
return None;
}
let path = entry.path_in(self.path_backing);
let status = self.compute_status(&mut *entry, path, diff, find);
Some(status.map(move |status| (&*entry, path, status, conflict)))
}
/// # On how racy-git is handled here
///
/// Basically the racy detection is a safety mechanism that ensures we can always just compare the stat
/// information between index and worktree and if they match we don't need to look at the content.
/// This usually just works but if a file updates quickly we could run into the following situation:
///
/// * save file version `A` from disk into worktree (git add)
/// * file is changed so fast that the mtime doesn't change - *we only looks at seconds by default*
/// * file contents change but file-size stays the same, so `"foo" -> "bar"` has the same size but different content
///
/// Now both `mtime` and `size`, and all other stat information, is the same but the file has actually changed.
/// This case is called *racily clean*. *The file should show up as changed but due to a data race it doesn't.*
/// This is the racy git problem.
///
/// To solve this we do the following trick: Whenever we modify the index, which includes `git status`, we save the
/// current timestamp before the modification starts. This timestamp fundamentally represents a checkpoint of sorts.
/// We "promise" ourselves that after the modification finishes all entries modified before this timestamp have the
/// racy git problem resolved.
///
/// So now when we modify the index we must resolve the racy git problem somehow. To do that we only need to look at
/// unchanged entries. Changed entries are not interesting since they are already showing up as changed anyway so there
/// isn't really a race-condition to worry about. This also explains why removing the `return` here doesn't have an apparent effect.
/// This entire branch here is just the optimization of "don't even look at index entries where the stat hasn't changed".
/// If we don't have this optimization the result shouldn't change, our status implementation will just be super slow :D
/// We calculate whether this change is `racy_clean`, so if the last `timestamp` is before or the same as the `mtime` of the entry
/// which is what `new_stat.is_racy(..)` does in the branch, and only if we are sure that there is no race condition
/// do we `return` early. Since we don't `return` early we just do a full content comparison below,
/// which always yields the correct result, there is no race condition there.
///
/// If a file showed up as racily clean and didn't change then we don't need to do anything. After this status check is
/// complete and the file won't show up as racily clean anymore, since it's mtime is now before the new timestamp.
/// However if the file did actually change then we really ran into one of those rare race conditions in that case we,
/// and git does the same, set the size of the file in the index to 0. This will always make the file show up as changed.
/// This adds the need to treat all files of size 0 in the index as changed. This is not quite right of course because 0 sized files
/// could be entirely valid and unchanged. Therefore this only applies if the oid doesn't match the oid of an empty file,
/// which is a constant.
///
/// Adapted from [here](https://github.com/Byron/gitoxide/pull/805#discussion_r1164676777).
fn compute_status<T, Find, E>(
&mut self,
entry: &mut gix_index::Entry,
git_path: &BStr,
diff: &mut impl CompareBlobs<Output = T>,
find: &mut Find,
) -> Result<Option<Change<T>>, Error>
where
E: std::error::Error + Send + Sync + 'static,
Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
{
// TODO fs cache
let worktree_path = gix_path::try_from_bstr(git_path).map_err(|_| Error::IllformedUtf8)?;
let worktree_path = self.worktree.join(worktree_path);
let metadata = match worktree_path.symlink_metadata() {
// TODO: check if any parent directory is a symlink
// we need to use fs::Cache for that
Ok(metadata) if metadata.is_dir() => {
// index entries are normally only for files/symlinks
// if a file turned into a directory it was removed
// the only exception here are submodules which are
// part of the index despite being directories
//
// TODO: submodules:
// if entry.mode.contains(Mode::COMMIT) &&
// resolve_gitlink_ref(ce->name, "HEAD", &sub))
return Ok(Some(Change::Removed));
}
Ok(metadata) => metadata,
Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(Some(Change::Removed)),
Err(err) => {
return Err(err.into());
}
};
if entry.flags.contains(gix_index::entry::Flags::INTENT_TO_ADD) {
return Ok(Some(Change::IntentToAdd));
}
let new_stat = gix_index::entry::Stat::from_fs(&metadata)?;
let executable_bit_changed =
match entry
.mode
.change_to_match_fs(&metadata, self.options.fs.symlink, self.options.fs.executable_bit)
{
Some(gix_index::entry::mode::Change::Type { .. }) => return Ok(Some(Change::Type)),
Some(gix_index::entry::mode::Change::ExecutableBit) => true,
None => false,
};
// Here we implement racy-git. See racy-git.txt in the git documentation for a detailed documentation.
//
// A file is racy if:
// 1. its `mtime` is at or after the last index timestamp and its entry stat information
// matches the on-disk file but the file contents are actually modified
// 2. it's size is 0 (set after detecting a file was racy previously)
//
// The first case is detected below by checking the timestamp if the file is marked unmodified.
// The second case is usually detected either because the on-disk file is not empty, hence
// the basic stat match fails, or by checking whether the size doesn't fit the oid.
let mut racy_clean = false;
if !executable_bit_changed
&& new_stat.matches(&entry.stat, self.options.stat)
// TODO: find a test for the following line or remove it. Is this more often hit with smudge/clean filters?
&& (!entry.id.is_empty_blob() || entry.stat.size == 0)
{
racy_clean = new_stat.is_racy(self.timestamp, self.options.stat);
if !racy_clean {
return Ok(None);
}
}
let read_file = WorktreeBlob {
buf: &mut self.buf,
path: &worktree_path,
entry,
options: self.options,
};
let read_blob = OdbBlob {
buf: &mut self.odb_buf,
id: &entry.id,
find,
};
let content_change = diff.compare_blobs::<Error>(entry, metadata.len() as usize, read_file, read_blob)?;
// This file is racy clean! Set the size to 0 so we keep detecting this as the file is updated.
if content_change.is_some() && racy_clean {
entry.stat.size = 0;
}
if content_change.is_some() || executable_bit_changed {
Ok(Some(Change::Modification {
executable_bit_changed,
content_change,
}))
} else {
// don't diff against this file next time since we know the file is unchanged.
entry.stat = new_stat;
Ok(None)
}
}
}
struct ReduceChange<'a, 'index, T: VisitEntry<'index>> {
collector: &'a mut T,
phantom: PhantomData<fn(&'index ())>,
}
impl<'index, T, C: VisitEntry<'index, ContentChange = T>> Reduce for ReduceChange<'_, 'index, C> {
type Input = Vec<StatusResult<'index, T>>;
type FeedProduce = ();
type Output = ();
type Error = Error;
fn feed(&mut self, items: Self::Input) -> Result<Self::FeedProduce, Self::Error> {
for item in items {
let (entry, path, change, conflict) = item?;
self.collector.visit_entry(entry, path, change, conflict);
}
Ok(())
}
fn finalize(self) -> Result<Self::Output, Self::Error> {
Ok(())
}
}
struct WorktreeBlob<'a> {
buf: &'a mut Vec<u8>,
path: &'a Path,
entry: &'a gix_index::Entry,
options: &'a Options,
}
struct OdbBlob<'a, Find, E>
where
E: std::error::Error + Send + Sync + 'static,
Find: FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E>,
{
buf: &'a mut Vec<u8>,
id: &'a gix_hash::oid,
find: Find,
}
impl<'a> content::ReadDataOnce<'a, Error> for WorktreeBlob<'a> {
fn read_data(self) -> Result<&'a [u8], Error> {
let res = read::data_to_buf_with_meta(
self.path,
self.buf,
self.entry.mode == gix_index::entry::Mode::SYMLINK,
&self.options.fs,
)?;
Ok(res)
}
}
impl<'a, Find, E> content::ReadDataOnce<'a, Error> for OdbBlob<'a, Find, E>
where
E: std::error::Error + Send + Sync + 'static,
Find: FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E>,
{
fn read_data(mut self) -> Result<&'a [u8], Error> {
(self.find)(self.id, self.buf)
.map(|b| b.data)
.map_err(move |err| Error::Find(Box::new(err)))
}
}