1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
use std::{io, marker::PhantomData, path::Path};

use bstr::BStr;
use filetime::FileTime;
use gix_features::parallel::{in_parallel_if, Reduce};

use crate::{
    read,
    status::{
        content,
        content::CompareBlobs,
        types::{Error, Options},
        Change, VisitEntry,
    },
};

/// Calculates the changes that need to be applied to an `index` to match the state of the `worktree` and makes them
/// observable in `collector`, along with information produced by `compare` which gets to see blobs that may have changes.
/// `options` are used to configure the operation.
///
/// Note that `index` is updated with the latest seen stat information from the worktree, and its timestamp is adjusted to
/// the current time for which it will be considered fresh.
///
/// Note that this isn't technically quite what this function does as this also provides some additional information,
/// like whether a file has conflicts, and files that were added with `git add` are shown as a special
/// changes despite not technically requiring a change to the index since `git add` already added the file to the index.
pub fn status<'index, T, Find, E>(
    index: &'index mut gix_index::State,
    worktree: &Path,
    collector: &mut impl VisitEntry<'index, ContentChange = T>,
    compare: impl CompareBlobs<Output = T> + Send + Clone,
    find: Find,
    options: Options,
) -> Result<(), Error>
where
    T: Send,
    E: std::error::Error + Send + Sync + 'static,
    Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
{
    // the order is absolutely critical here we use the old timestamp to detect racy index entries
    // (modified at or after the last index update) during the index update we then set those
    // entries size to 0 (see below) to ensure they keep showing up as racy and reset the timestamp.
    let timestamp = index.timestamp();
    index.set_timestamp(FileTime::now());
    let (chunk_size, thread_limit, _) = gix_features::parallel::optimize_chunk_size_and_thread_limit(
        100,
        index.entries().len().into(),
        options.thread_limit,
        None,
    );
    let (entries, path_backing) = index.entries_mut_and_pathbacking();
    in_parallel_if(
        || true, // TODO: heuristic: when is parallelization not worth it?
        entries.chunks_mut(chunk_size),
        thread_limit,
        {
            let options = &options;
            move |_| {
                (
                    State {
                        buf: Vec::new(),
                        odb_buf: Vec::new(),
                        timestamp,
                        path_backing,
                        worktree,
                        options,
                    },
                    compare.clone(),
                    find.clone(),
                )
            }
        },
        |entries, (state, diff, find)| {
            entries
                .iter_mut()
                .filter_map(|entry| state.process(entry, diff, find))
                .collect()
        },
        ReduceChange {
            collector,
            phantom: PhantomData,
        },
    )
}

struct State<'a, 'b> {
    buf: Vec<u8>,
    odb_buf: Vec<u8>,
    timestamp: FileTime,
    // path_cache: fs::Cache TODO path cache
    path_backing: &'b [u8],
    worktree: &'a Path,
    options: &'a Options,
}

type StatusResult<'index, T> = Result<(&'index gix_index::Entry, &'index BStr, Option<Change<T>>, bool), Error>;

impl<'index> State<'_, 'index> {
    fn process<T, Find, E>(
        &mut self,
        entry: &'index mut gix_index::Entry,
        diff: &mut impl CompareBlobs<Output = T>,
        find: &mut Find,
    ) -> Option<StatusResult<'index, T>>
    where
        E: std::error::Error + Send + Sync + 'static,
        Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
    {
        let conflict = match entry.stage() {
            0 => false,
            1 => true,
            _ => return None,
        };
        if entry.flags.intersects(
            gix_index::entry::Flags::UPTODATE
                | gix_index::entry::Flags::SKIP_WORKTREE
                | gix_index::entry::Flags::ASSUME_VALID
                | gix_index::entry::Flags::FSMONITOR_VALID,
        ) {
            return None;
        }
        let path = entry.path_in(self.path_backing);
        let status = self.compute_status(&mut *entry, path, diff, find);
        Some(status.map(move |status| (&*entry, path, status, conflict)))
    }

    /// # On how racy-git is handled here
    ///
    /// Basically the racy detection is a safety mechanism that ensures we can always just compare the stat
    /// information between index and worktree and if they match we don't need to look at the content.
    /// This usually just works but if a file updates quickly we could run into the following situation:
    ///
    /// * save file version `A` from disk into worktree (git add)
    /// * file is changed so fast that the mtime doesn't change - *we only looks at seconds by default*
    /// * file contents change but file-size stays the same, so `"foo" -> "bar"` has the same size but different content
    ///
    /// Now both `mtime` and `size`, and all other stat information, is the same but the file has actually changed.
    /// This case is called *racily clean*. *The file should show up as changed but due to a data race it doesn't.*
    /// This is the racy git problem.
    ///
    /// To solve this we do the following trick: Whenever we modify the index, which includes `git status`, we save the
    /// current timestamp before the modification starts. This timestamp fundamentally represents a checkpoint of sorts.
    /// We "promise" ourselves that after the modification finishes all entries modified before this timestamp have the
    /// racy git problem resolved.
    ///
    /// So now when we modify the index we must resolve the racy git problem somehow. To do that we only need to look at
    /// unchanged entries. Changed entries are not interesting since they are already showing up as changed anyway so there
    /// isn't really a race-condition to worry about. This also explains why removing the `return` here doesn't have an apparent effect.
    /// This entire branch here is just the optimization of "don't even look at index entries where the stat hasn't changed".
    /// If we don't have this optimization the result shouldn't change, our status implementation will just be super slow :D

    /// We calculate whether this change is `racy_clean`, so if the last `timestamp` is before or the same as the `mtime` of the entry
    /// which is what `new_stat.is_racy(..)` does in the branch, and only if we are sure that there is no race condition
    /// do we `return` early. Since we don't `return` early we just do a full content comparison below,
    /// which always yields the correct result, there is no race condition there.
    ///
    /// If a file showed up as racily clean and didn't change then we don't need to do anything. After this status check is
    /// complete and the file won't show up as racily clean anymore, since it's mtime is now before the new timestamp.
    /// However if the file did actually change then we really ran into one of those rare race conditions in that case we,
    /// and git does the same, set the size of the file in the index to 0. This will always make the file show up as changed.
    /// This adds the need to treat all files of size 0 in the index as changed. This is not quite right of course because 0 sized files
    /// could be entirely valid and unchanged. Therefore this only applies if the oid doesn't match the oid of an empty file,
    /// which is a constant.
    ///
    /// Adapted from [here](https://github.com/Byron/gitoxide/pull/805#discussion_r1164676777).
    fn compute_status<T, Find, E>(
        &mut self,
        entry: &mut gix_index::Entry,
        git_path: &BStr,
        diff: &mut impl CompareBlobs<Output = T>,
        find: &mut Find,
    ) -> Result<Option<Change<T>>, Error>
    where
        E: std::error::Error + Send + Sync + 'static,
        Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E> + Send + Clone,
    {
        // TODO fs cache
        let worktree_path = gix_path::try_from_bstr(git_path).map_err(|_| Error::IllformedUtf8)?;
        let worktree_path = self.worktree.join(worktree_path);
        let metadata = match worktree_path.symlink_metadata() {
            // TODO: check if any parent directory is a symlink
            //       we need to use fs::Cache for that
            Ok(metadata) if metadata.is_dir() => {
                // index entries are normally only for files/symlinks
                // if a file turned into a directory it was removed
                // the only exception here are submodules which are
                // part of the index despite being directories
                //
                // TODO: submodules:
                //   if entry.mode.contains(Mode::COMMIT) &&
                //     resolve_gitlink_ref(ce->name, "HEAD", &sub))
                return Ok(Some(Change::Removed));
            }
            Ok(metadata) => metadata,
            Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(Some(Change::Removed)),
            Err(err) => {
                return Err(err.into());
            }
        };
        if entry.flags.contains(gix_index::entry::Flags::INTENT_TO_ADD) {
            return Ok(Some(Change::IntentToAdd));
        }
        let new_stat = gix_index::entry::Stat::from_fs(&metadata)?;
        let executable_bit_changed =
            match entry
                .mode
                .change_to_match_fs(&metadata, self.options.fs.symlink, self.options.fs.executable_bit)
            {
                Some(gix_index::entry::mode::Change::Type { .. }) => return Ok(Some(Change::Type)),
                Some(gix_index::entry::mode::Change::ExecutableBit) => true,
                None => false,
            };

        // Here we implement racy-git. See racy-git.txt in the git documentation for a detailed documentation.
        //
        // A file is racy if:
        // 1. its `mtime` is at or after the last index timestamp and its entry stat information
        //   matches the on-disk file but the file contents are actually modified
        // 2. it's size is 0 (set after detecting a file was racy previously)
        //
        // The first case is detected below by checking the timestamp if the file is marked unmodified.
        // The second case is usually detected either because the on-disk file is not empty, hence
        // the basic stat match fails, or by checking whether the size doesn't fit the oid.
        let mut racy_clean = false;
        if !executable_bit_changed
            && new_stat.matches(&entry.stat, self.options.stat)
            // TODO: find a test for the following line or remove it. Is this more often hit with smudge/clean filters?
            && (!entry.id.is_empty_blob() || entry.stat.size == 0)
        {
            racy_clean = new_stat.is_racy(self.timestamp, self.options.stat);
            if !racy_clean {
                return Ok(None);
            }
        }

        let read_file = WorktreeBlob {
            buf: &mut self.buf,
            path: &worktree_path,
            entry,
            options: self.options,
        };
        let read_blob = OdbBlob {
            buf: &mut self.odb_buf,
            id: &entry.id,
            find,
        };
        let content_change = diff.compare_blobs::<Error>(entry, metadata.len() as usize, read_file, read_blob)?;
        // This file is racy clean! Set the size to 0 so we keep detecting this as the file is updated.
        if content_change.is_some() && racy_clean {
            entry.stat.size = 0;
        }
        if content_change.is_some() || executable_bit_changed {
            Ok(Some(Change::Modification {
                executable_bit_changed,
                content_change,
            }))
        } else {
            // don't diff against this file next time since we know the file is unchanged.
            entry.stat = new_stat;
            Ok(None)
        }
    }
}

struct ReduceChange<'a, 'index, T: VisitEntry<'index>> {
    collector: &'a mut T,
    phantom: PhantomData<fn(&'index ())>,
}

impl<'index, T, C: VisitEntry<'index, ContentChange = T>> Reduce for ReduceChange<'_, 'index, C> {
    type Input = Vec<StatusResult<'index, T>>;

    type FeedProduce = ();

    type Output = ();

    type Error = Error;

    fn feed(&mut self, items: Self::Input) -> Result<Self::FeedProduce, Self::Error> {
        for item in items {
            let (entry, path, change, conflict) = item?;
            self.collector.visit_entry(entry, path, change, conflict);
        }
        Ok(())
    }

    fn finalize(self) -> Result<Self::Output, Self::Error> {
        Ok(())
    }
}

struct WorktreeBlob<'a> {
    buf: &'a mut Vec<u8>,
    path: &'a Path,
    entry: &'a gix_index::Entry,
    options: &'a Options,
}

struct OdbBlob<'a, Find, E>
where
    E: std::error::Error + Send + Sync + 'static,
    Find: FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E>,
{
    buf: &'a mut Vec<u8>,
    id: &'a gix_hash::oid,
    find: Find,
}

impl<'a> content::ReadDataOnce<'a, Error> for WorktreeBlob<'a> {
    fn read_data(self) -> Result<&'a [u8], Error> {
        let res = read::data_to_buf_with_meta(
            self.path,
            self.buf,
            self.entry.mode == gix_index::entry::Mode::SYMLINK,
            &self.options.fs,
        )?;
        Ok(res)
    }
}

impl<'a, Find, E> content::ReadDataOnce<'a, Error> for OdbBlob<'a, Find, E>
where
    E: std::error::Error + Send + Sync + 'static,
    Find: FnMut(&gix_hash::oid, &'a mut Vec<u8>) -> Result<gix_object::BlobRef<'a>, E>,
{
    fn read_data(mut self) -> Result<&'a [u8], Error> {
        (self.find)(self.id, self.buf)
            .map(|b| b.data)
            .map_err(move |err| Error::Find(Box::new(err)))
    }
}