liboxen 0.9.9-alpha

Oxen is a fast version control tool for unstructured data, written in Rust, that helps you version datasets.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
use std::collections::HashSet;
use std::path::PathBuf;

use serde::{Deserialize, Serialize};

use crate::core::index::{CommitDirEntryReader, CommitEntryReader};
use crate::error::OxenError;
use crate::model::diff::dir_diff_summary::DirDiffSummaryImpl;
use crate::model::{Commit, EntryDataType, MetadataEntry};
use crate::opts::DFOpts;
use crate::view::compare::AddRemoveModifyCounts;
use crate::view::entry::ResourceVersion;
use crate::{
    api,
    model::{CommitEntry, LocalRepository},
    util,
};

use super::diff_entry_status::DiffEntryStatus;
use super::dir_diff_summary::DirDiffSummary;
use super::generic_diff::GenericDiff;
use super::generic_diff_summary::GenericDiffSummary;
use super::tabular_diff::TabularDiff;
use super::tabular_diff_summary::TabularDiffWrapper;

/// A single entry (file or directory) in a diff between a base and a head commit.
///
/// Either side may be absent: the `head_*` fields are `None` when the entry has no
/// head version, and the `base_*` fields are `None` when it has no base version.
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct DiffEntry {
    /// String form of the `DiffEntryStatus` passed to the constructor.
    pub status: String,
    /// Kind of data the entry holds (`EntryDataType::Dir` for directories).
    pub data_type: EntryDataType,
    /// Path of the entry as a string, taken from the head side when present, otherwise from base.
    pub filename: String,
    /// `true` when the entry was built by `from_dir`, `false` when built by `from_commit_entry`.
    pub is_dir: bool,
    /// Size of the entry on the side that `filename` was taken from.
    pub size: u64,

    // Resource
    /// Commit id and path of the head version, if any.
    pub head_resource: Option<ResourceVersion>,
    /// Commit id and path of the base version, if any.
    pub base_resource: Option<ResourceVersion>,

    // Entry
    /// Metadata of the head version, if any.
    pub head_entry: Option<MetadataEntry>,
    /// Metadata of the base version, if any.
    pub base_entry: Option<MetadataEntry>,

    // Diff summary
    /// Lightweight summary of the change (file counts for directories, tabular summary for tables).
    pub diff_summary: Option<GenericDiffSummary>,

    // Full Diff (only exposed sometimes for performance reasons)
    /// Full diff payload; currently only populated for tabular files when explicitly requested.
    pub diff: Option<GenericDiff>,
}

impl DiffEntry {
    /// Reports whether the head and base versions of this entry differ.
    ///
    /// Only entries that exist on both sides can report a change; the comparison
    /// is based purely on the recorded sizes.
    pub fn has_changes(&self) -> bool {
        // TODO: do a deeper check than size, but this is good for MVP
        if let (Some(head), Some(base)) = (&self.head_entry, &self.base_entry) {
            return head.size != base.size;
        }

        // One (or both) sides missing: nothing to compare against
        false
    }

    pub fn from_dir(
        repo: &LocalRepository,
        base_dir: Option<&PathBuf>,
        base_commit: &Commit,
        head_dir: Option<&PathBuf>,
        head_commit: &Commit,
        status: DiffEntryStatus,
    ) -> Result<DiffEntry, OxenError> {
        // Get the metadata entries
        let mut base_entry = DiffEntry::metadata_from_dir(repo, base_dir, base_commit);
        let mut head_entry = DiffEntry::metadata_from_dir(repo, head_dir, head_commit);

        // Need to check whether we have the head or base entry to check data about the file
        let (current_dir, current_entry) = if let Some(dir) = head_dir {
            (dir, head_entry.to_owned().unwrap())
        } else {
            (base_dir.unwrap(), base_entry.to_owned().unwrap())
        };

        let diff_summary = DiffEntry::diff_summary_from_dir(repo, &base_entry, &head_entry)?;
        let head_resource = DiffEntry::resource_from_dir(head_dir, head_commit);
        let base_resource = DiffEntry::resource_from_dir(base_dir, base_commit);

        if base_entry.is_some() {
            base_entry.as_mut().unwrap().resource = base_resource.clone();
        }

        if head_entry.is_some() {
            head_entry.as_mut().unwrap().resource = head_resource.clone();
        }

        Ok(DiffEntry {
            status: status.to_string(),
            data_type: EntryDataType::Dir,
            filename: current_dir.as_os_str().to_str().unwrap().to_string(),
            is_dir: true,
            size: current_entry.size,
            head_resource,
            base_resource,
            head_entry,
            base_entry,
            diff_summary,
            diff: None, // TODO: Come back to what we want a full directory diff to look like
        })
    }

    /// Builds the [`DiffEntry`] describing a single file across two commits.
    ///
    /// The head entry is preferred for the file's name, size and data type; the base
    /// entry is used only when `head_entry` is `None`.
    ///
    /// # Arguments
    /// * `repo` - repository the entries belong to
    /// * `base_entry` / `base_commit` - entry and commit on the base side (`None` entry if absent)
    /// * `head_entry` / `head_commit` - entry and commit on the head side (`None` entry if absent)
    /// * `status` - status to record on the entry
    /// * `should_do_full_diff` - when `true`, tabular files also get a full diff payload
    /// * `df_opts` - data frame options; a full tabular diff is only computed when these are provided
    ///
    /// # Panics
    /// Panics if both `base_entry` and `head_entry` are `None`, or if the entry path
    /// is not valid UTF-8.
    // The commit objects are passed in for speed so they do not have to be looked up here,
    // which is what pushes the argument count past clippy's default limit.
    #[allow(clippy::too_many_arguments)]
    pub fn from_commit_entry(
        repo: &LocalRepository,
        base_entry: Option<CommitEntry>,
        base_commit: &Commit, // pass in commit objects for speed so we don't have to lookup later
        head_entry: Option<CommitEntry>,
        head_commit: &Commit,
        status: DiffEntryStatus,
        should_do_full_diff: bool,
        df_opts: Option<DFOpts>, // only for tabular
    ) -> DiffEntry {
        // Need to check whether we have the head or base entry to check data about the file.
        // Borrow it rather than cloning: only the path, size and version path are needed.
        let current_entry = head_entry
            .as_ref()
            .or(base_entry.as_ref())
            .expect("DiffEntry::from_commit_entry requires a base or head entry");
        let version_path = util::fs::version_path(repo, current_entry);
        let data_type = util::fs::file_data_type(&version_path);
        let filename = current_entry
            .path
            .as_os_str()
            .to_str()
            .expect("entry path must be valid UTF-8")
            .to_string();
        let size = current_entry.num_bytes;

        let base_resource = DiffEntry::resource_from_entry(base_entry.clone());
        let head_resource = DiffEntry::resource_from_entry(head_entry.clone());

        let mut base_meta_entry =
            MetadataEntry::from_commit_entry(repo, base_entry.clone(), base_commit);
        let mut head_meta_entry =
            MetadataEntry::from_commit_entry(repo, head_entry.clone(), head_commit);

        // Point each metadata entry (when there is one) at the resource for its side
        if let Some(meta) = base_meta_entry.as_mut() {
            meta.resource = base_resource.clone();
        }

        if let Some(meta) = head_meta_entry.as_mut() {
            meta.resource = head_resource.clone();
        }

        // A full diff is only computed for tabular files, when requested, and when
        // data frame options were supplied; everything else just gets a summary.
        // TODO: Clean this up, but want to get a prototype to work first
        let full_diff_opts =
            df_opts.filter(|_| should_do_full_diff && data_type == EntryDataType::Tabular);

        let (diff_summary, diff) = if let Some(df_opts) = full_diff_opts {
            let diff = TabularDiff::from_commit_entries(repo, &base_entry, &head_entry, df_opts);
            let summary =
                GenericDiffSummary::TabularDiffWrapper(diff.clone().tabular.summary.to_wrapper());
            (Some(summary), Some(GenericDiff::TabularDiff(diff)))
        } else {
            let summary = DiffEntry::diff_summary_from_file(
                repo,
                data_type.clone(),
                &base_entry,
                &head_entry,
            );
            (summary, None) // TODO: other full diffs...
        };

        DiffEntry {
            status: status.to_string(),
            data_type,
            filename,
            is_dir: false,
            size,
            head_resource,
            base_resource,
            head_entry: head_meta_entry,
            base_entry: base_meta_entry,
            diff_summary,
            diff,
        }
    }

    /// Converts an optional commit entry into the resource (commit id + path) it refers to.
    ///
    /// Returns `None` when no entry is given. Panics if the entry path is not valid UTF-8.
    fn resource_from_entry(entry: Option<CommitEntry>) -> Option<ResourceVersion> {
        let commit_entry = entry?;
        let version = commit_entry.commit_id.to_string();
        let path = commit_entry.path.as_os_str().to_str().unwrap().to_string();
        Some(ResourceVersion { version, path })
    }

    /// Pairs an optional directory path with the given commit's id to form a resource.
    ///
    /// Returns `None` when no directory is given. Panics if the path is not valid UTF-8.
    fn resource_from_dir(dir: Option<&PathBuf>, commit: &Commit) -> Option<ResourceVersion> {
        let dir_path = dir?;
        let version = commit.id.to_string();
        let path = dir_path.as_os_str().to_str().unwrap().to_string();
        Some(ResourceVersion { version, path })
    }

    fn metadata_from_dir(
        repo: &LocalRepository,
        dir: Option<&PathBuf>,
        commit: &Commit,
    ) -> Option<MetadataEntry> {
        if let Some(dir) = dir {
            match api::local::entries::get_meta_entry(repo, commit, dir) {
                Ok(entry) => Some(entry),
                Err(_) => None,
            }
        } else {
            None
        }
    }

    /// Computes the file-count summary for a directory, dispatching on which sides exist.
    ///
    /// * neither side present - no summary
    /// * only base present - every file counts as removed
    /// * only head present - every file counts as added
    /// * both present - the two sides are compared
    fn diff_summary_from_dir(
        repo: &LocalRepository,
        base_dir: &Option<MetadataEntry>,
        head_dir: &Option<MetadataEntry>,
    ) -> Result<Option<GenericDiffSummary>, OxenError> {
        log::debug!("diff_summary_from_dir base_dir: {:?}", base_dir);
        log::debug!("diff_summary_from_dir head_dir: {:?}", head_dir);

        match (base_dir, head_dir) {
            // nothing on either side, so there is nothing to summarize
            (None, None) => Ok(None),
            // the directory only exists in base: we deleted all the files
            (Some(base), None) => DiffEntry::r_compute_removed_files(repo, base),
            // the directory only exists in head: we added all the files
            (None, Some(head)) => DiffEntry::r_compute_added_files(repo, head),
            // present on both sides: compare the two
            (Some(base), Some(head)) => DiffEntry::r_compute_diff_all_files(repo, base, head),
        }
    }

    fn r_compute_diff_all_files(
        repo: &LocalRepository,
        base_dir: &MetadataEntry,
        head_dir: &MetadataEntry,
    ) -> Result<Option<GenericDiffSummary>, OxenError> {
        let base_commit_id = &base_dir.latest_commit.as_ref().unwrap().id;
        let head_commit_id = &head_dir.latest_commit.as_ref().unwrap().id;

        // base and head path will be the same so just choose base
        let path = PathBuf::from(&base_dir.resource.clone().unwrap().path);

        let mut num_removed = 0;
        let mut num_added = 0;
        let mut num_modified = 0;

        // Find all the children of the dir and sum up their counts
        let commit_entry_reader = CommitEntryReader::new_from_commit_id(repo, base_commit_id)?;
        let mut dirs = commit_entry_reader.list_dir_children(&path)?;

        let commit_entry_reader = CommitEntryReader::new_from_commit_id(repo, head_commit_id)?;
        let mut other = commit_entry_reader.list_dir_children(&path)?;
        dirs.append(&mut other);
        dirs.push(path.clone());

        // Uniq them
        let dirs: HashSet<PathBuf> = HashSet::from_iter(dirs);

        // What base_commit_id and head_commit_id are happening here?
        log::debug!("base_commit_id 284 is {:?}", base_commit_id);
        log::debug!("head_commit_id 285 is {:?}", head_commit_id);

        for dir in dirs {
            let base_dir_reader = CommitDirEntryReader::new(repo, base_commit_id, &dir)?;
            let head_dir_reader = CommitDirEntryReader::new(repo, head_commit_id, &dir)?;

            // List the entries in hash sets
            let head_entries = head_dir_reader.list_entries_set()?;
            let base_entries = base_dir_reader.list_entries_set()?;
            log::debug!(
                "diff_summary_from_dir head_entries: {:?}",
                head_entries.len()
            );
            log::debug!(
                "diff_summary_from_dir base_entries: {:?}",
                base_entries.len()
            );

            // Find the added entries
            let added_entries = head_entries
                .difference(&base_entries)
                .collect::<HashSet<_>>();
            num_added += added_entries.len();

            // Find the removed entries
            let removed_entries = base_entries
                .difference(&head_entries)
                .collect::<HashSet<_>>();
            num_removed += removed_entries.len();

            // Find the modified entries
            for base_entry in base_entries {
                if let Some(head_entry) = head_entries.get(&base_entry) {
                    if head_entry.hash != base_entry.hash {
                        num_modified += 1;
                    }
                }
            }
        }

        Ok(Some(GenericDiffSummary::DirDiffSummary(DirDiffSummary {
            dir: DirDiffSummaryImpl {
                file_counts: AddRemoveModifyCounts {
                    added: num_added,
                    removed: num_removed,
                    modified: num_modified,
                },
            },
        })))
    }

    fn r_compute_removed_files(
        repo: &LocalRepository,
        base_dir: &MetadataEntry,
    ) -> Result<Option<GenericDiffSummary>, OxenError> {
        let commit_id = &base_dir.latest_commit.as_ref().unwrap().id;
        let path = PathBuf::from(&base_dir.resource.clone().unwrap().path);
        log::debug!("r_compute_removed_files base_dir: {:?}", path);

        // Count all removals in the directory and its children
        let commit_entry_reader = CommitEntryReader::new_from_commit_id(repo, commit_id)?;
        let mut dirs = commit_entry_reader.list_dir_children(&path)?;
        dirs.push(path);

        log::debug!("r_compute_removed_files got dirs: {:?}", dirs.len());

        let mut num_removed = 0;
        for dir in dirs {
            let dir_reader = CommitDirEntryReader::new(repo, commit_id, &dir)?;
            let count = dir_reader.num_entries();
            log::debug!("r_compute_removed_files dir: {:?} count: {}", dir, count);

            num_removed += count;
        }

        Ok(Some(GenericDiffSummary::DirDiffSummary(DirDiffSummary {
            dir: DirDiffSummaryImpl {
                file_counts: AddRemoveModifyCounts {
                    added: 0,
                    removed: num_removed,
                    modified: 0,
                },
            },
        })))
    }

    fn r_compute_added_files(
        repo: &LocalRepository,
        head_dir: &MetadataEntry,
    ) -> Result<Option<GenericDiffSummary>, OxenError> {
        let commit_id = &head_dir.latest_commit.as_ref().unwrap().id;
        let path = PathBuf::from(&head_dir.resource.clone().unwrap().path);
        log::debug!("r_compute_added_files base_dir: {:?}", path);

        // Count all removals in the directory and its children
        let commit_entry_reader = CommitEntryReader::new_from_commit_id(repo, commit_id)?;
        let mut dirs = commit_entry_reader.list_dir_children(&path)?;
        dirs.push(path);

        log::debug!("r_compute_added_files got dirs: {:?}", dirs.len());

        let mut num_added = 0;
        for dir in dirs {
            let dir_reader = CommitDirEntryReader::new(repo, commit_id, &dir)?;
            let count = dir_reader.num_entries();
            log::debug!("r_compute_added_files dir: {:?} count: {}", dir, count);

            num_added += count;
        }

        Ok(Some(GenericDiffSummary::DirDiffSummary(DirDiffSummary {
            dir: DirDiffSummaryImpl {
                file_counts: AddRemoveModifyCounts {
                    added: num_added,
                    removed: 0,
                    modified: 0,
                },
            },
        })))
    }

    /// Produces the lightweight diff summary for a single file, if its data type has one.
    ///
    /// Only tabular files currently get a summary; every other data type yields `None`.
    fn diff_summary_from_file(
        repo: &LocalRepository,
        data_type: EntryDataType,
        base_entry: &Option<CommitEntry>,
        head_entry: &Option<CommitEntry>,
    ) -> Option<GenericDiffSummary> {
        // TODO match on type, and create the appropriate summary
        if !matches!(data_type, EntryDataType::Tabular) {
            return None;
        }

        let wrapper = TabularDiffWrapper::from_commit_entries(repo, base_entry, head_entry);
        Some(GenericDiffSummary::TabularDiffWrapper(wrapper))
    }
}