xvc_file/common/
mod.rs

1//! Common operations for xvc file
2pub mod compare;
3pub mod gitignore;
4
5use std::collections::{HashMap, HashSet};
6use std::fs::{self};
7
8use std::{
9    fs::Metadata,
10    path::{Path, PathBuf},
11};
12
13#[cfg(unix)]
14use std::os::unix::fs::PermissionsExt;
15
16use crate::common::gitignore::IgnoreOperation;
17use crate::error::{Error, Result};
18use crossbeam_channel::{Receiver, Sender};
19use derive_more::{AsRef, Deref, Display, From, FromStr};
20use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
21use serde::{Deserialize, Serialize};
22use xvc_core::EventLog;
23use xvc_core::{
24    all_paths_and_metadata, apply_diff, conf, error, get_absolute_git_command,
25    get_git_tracked_files, info, persist,
26    types::xvcpath::XvcCachePath,
27    util::{file::make_symlink, xvcignore::COMMON_IGNORE_PATTERNS},
28    uwr, warn, AbsolutePath, ContentDigest, DiffStore, FromConfigKey, Glob, HStore, HashAlgorithm,
29    PathSync, RecheckMethod, Storable, TextOrBinary, XvcFileType, XvcMetadata, XvcOutputSender,
30    XvcPath, XvcPathMetadataMap, XvcRoot, XvcStore,
31};
32use xvc_core::{path_metadata_map_from_file_targets, XvcWalkerError};
33
34use self::gitignore::IgnoreOp;
35
36/// Represents whether a file is a text file or not. We wrap [TextOrBinary] to specify [persist!] and [conf!].
37#[derive(
38    Debug,
39    Clone,
40    PartialEq,
41    Eq,
42    PartialOrd,
43    Ord,
44    Serialize,
45    Deserialize,
46    Hash,
47    Display,
48    FromStr,
49    From,
50    AsRef,
51    Deref,
52    Copy,
53    Default,
54)]
55pub struct FileTextOrBinary(TextOrBinary);
56conf!(FileTextOrBinary, "file.track.text_or_binary");
57persist!(FileTextOrBinary, "file-text-or-binary");
58
59impl FileTextOrBinary {
60    /// Returns the inner TextOrBinary
61    pub fn as_inner(&self) -> TextOrBinary {
62        self.0
63    }
64}
65
66/// Receives path and metadata and sends content digests of the sent paths.
67pub fn pipe_path_digest(
68    receiver: Receiver<(PathBuf, Metadata)>,
69    sender: Sender<(PathBuf, ContentDigest)>,
70    algorithm: HashAlgorithm,
71    text_or_binary: TextOrBinary,
72) -> Result<()> {
73    while let Ok((p, _)) = receiver.try_recv() {
74        let digest = ContentDigest::new(&p, algorithm, text_or_binary);
75        match digest {
76            Ok(digest) => {
77                let _ = sender.send((p, digest));
78            }
79            Err(err) => {
80                log::warn!("{:?}", err);
81            }
82        }
83    }
84    Ok(())
85}
86
87/// This is to convert targets given in the CLI to XvcPaths. It doesn't walk the
88/// file system. It's to be used in `xvc file carry-in` or `xvc file recheck`,
89/// where we already track the files in the store.
90///
91/// Just loads the stores, compiles targets as globs and checks
92/// which paths in the store matches. If the matches contain directories, all their
93/// children are also selected.
94///
95/// If `targets` is `None`, all paths in the store are returned.
96pub fn load_targets_from_store(
97    output_snd: &XvcOutputSender,
98    xvc_root: &XvcRoot,
99    current_dir: &AbsolutePath,
100    targets: &Option<Vec<String>>,
101) -> Result<HStore<XvcPath>> {
102    let xvc_path_store: XvcStore<XvcPath> = xvc_root.load_store()?;
103    filter_targets_from_store(output_snd, xvc_root, &xvc_path_store, current_dir, targets)
104}
105
106/// Filters the paths in the store by given globs.
107///
108/// If `targets` is None, returns all paths in the store.
109///
110/// If `current_dir` is not the root, all targets are prefixed with it.
111pub fn filter_targets_from_store(
112    output_snd: &XvcOutputSender,
113    xvc_root: &XvcRoot,
114    xvc_path_store: &XvcStore<XvcPath>,
115    current_dir: &AbsolutePath,
116    targets: &Option<Vec<String>>,
117) -> Result<HStore<XvcPath>> {
118    // If we are not in the root, we add current dir to all targets and recur.
119    if *current_dir != *xvc_root.absolute_path() {
120        let cwd = current_dir
121            .strip_prefix(xvc_root.absolute_path())?
122            .to_str()
123            .unwrap();
124        let targets = match targets {
125            Some(targets) => targets.iter().map(|t| format!("{cwd}{t}")).collect(),
126            None => vec![cwd.to_string()],
127        };
128
129        return filter_targets_from_store(
130            output_snd,
131            xvc_root,
132            xvc_path_store,
133            xvc_root.absolute_path(),
134            &Some(targets),
135        );
136    }
137
138    if let Some(targets) = targets {
139        let paths =
140            filter_paths_by_globs(output_snd, xvc_root, xvc_path_store, targets.as_slice())?;
141        Ok(paths)
142    } else {
143        Ok(xvc_path_store.into())
144    }
145}
146
147/// Filter a set of paths by a set of globs. The globs are compiled into a
148/// GlobSet and paths are checked against the set.
149///
150/// If a target ends with /, it's considered a directory and all its children are also selected.
151pub fn filter_paths_by_globs(
152    output_snd: &XvcOutputSender,
153    xvc_root: &XvcRoot,
154    paths: &XvcStore<XvcPath>,
155    globs: &[String],
156) -> Result<HStore<XvcPath>> {
157    if globs.is_empty() {
158        return Ok(paths.into());
159    }
160
161    // Ensure directories end with /
162    let globs = globs
163        .iter()
164        .map(|g| {
165            if !g.ends_with('/') && !g.contains('*') {
166                let slashed = format!("{g}/");
167                // We don't track directories. Instead we look for files that start with the directory.
168                if paths.any(|_, p| p.as_str().starts_with(&slashed)) {
169                    slashed
170                } else {
171                    g.clone()
172                }
173            } else {
174                g.clone()
175            }
176        })
177        .collect::<Vec<String>>();
178
179    let mut glob_matcher = build_glob_matcher(output_snd, xvc_root, &globs)?;
180    let paths = paths
181        .iter()
182        .filter_map(|(e, p)| {
183            if glob_matcher.is_match(p.as_str()) {
184                Some((*e, p.clone()))
185            } else {
186                None
187            }
188        })
189        .collect();
190
191    Ok(paths)
192}
193
194/// Builds a glob matcher based on the provided directory and glob patterns.
195///
196/// # Arguments
197///
198/// * `output_snd`: A sender for output messages.
199/// * `dir`: The directory to which the glob patterns will be applied.
200/// * `globs`: A slice of glob patterns as strings.
201///
202/// # Returns
203///
204/// * `Result<Glob>`: A `Result` that contains the `Glob` matcher if successful, or an error if not.
205///
206/// # Errors
207///
208/// This function will return an error if any of the glob patterns are invalid.
209///
210pub fn build_glob_matcher(
211    output_snd: &XvcOutputSender,
212    dir: &Path,
213    globs: &[String],
214) -> Result<Glob> {
215    let mut glob_matcher = Glob::default();
216    globs.iter().for_each(|t| {
217        if t.ends_with('/') {
218            if !glob_matcher.add(&format!("{t}**")) {
219                error!(output_snd, "Error in glob: {t}");
220            }
221        } else if !t.contains('*') {
222            let abs_target = dir.join(Path::new(t));
223            if abs_target.is_dir() {
224                if !glob_matcher.add(&format!("{t}/**")) {
225                    error!(output_snd, "Error in glob: {t}")
226                }
227            } else if !glob_matcher.add(t) {
228                error!(output_snd, "Error in glob: {t}")
229            }
230        } else if !glob_matcher.add(t) {
231            error!(output_snd, "Error in glob: {t}")
232        }
233    });
234    Ok(glob_matcher)
235}
236
237/// Converts targets to a map of XvcPaths and their metadata. It walks the file
238/// system with [`all_paths_and_metadata`]. This is aimed towards `xvc file
239/// track`, `xvc file hash` and similar commands where we work with the existing
240/// files.
241///
242/// This walks all the repository. It doesn't try to optimize the walk by
243/// selecting targets first, because,
244/// - This is a premature optimization.
245/// - We need to consider ignore files and this requires to start a walk from
246///   the root.
247///
248/// If some day we need to optimize first walking the ignores, then walking the
249/// directories in the targets, I'd be glad that this is used in very large
250/// repositories.
251pub fn targets_from_disk(
252    output_snd: &XvcOutputSender,
253    xvc_root: &XvcRoot,
254    current_dir: &AbsolutePath,
255    targets: &Option<Vec<String>>,
256    filter_git_paths: bool,
257) -> Result<XvcPathMetadataMap> {
258    // If we are not in the root, we add current dir to all targets and recur.
259    if *current_dir != *xvc_root.absolute_path() {
260        let cwd = current_dir
261            .strip_prefix(xvc_root.absolute_path())?
262            .to_str()
263            .unwrap();
264
265        let cwd = if cwd.ends_with('/') {
266            cwd.to_owned()
267        } else {
268            format!("{cwd}/")
269        };
270
271        let targets = match targets {
272            Some(targets) => targets.iter().map(|t| format!("{cwd}{t}")).collect(),
273            None => vec![cwd.to_string()],
274        };
275        return targets_from_disk(
276            output_snd,
277            xvc_root,
278            xvc_root.absolute_path(),
279            &Some(targets),
280            filter_git_paths,
281        );
282    }
283
284    let has_globs_or_dirs = targets
285        .as_ref()
286        .map(|targets| {
287            targets.iter().any(|t| {
288                t.contains('*') || t.ends_with('/') || t.contains('/') || PathBuf::from(t).is_dir()
289            })
290        })
291        // None means all paths
292        .unwrap_or(true);
293    // If there are no globs/directories in the targets, no need to retrieve all the paths
294    // here.
295
296    let all_paths = if has_globs_or_dirs {
297        all_paths_and_metadata(xvc_root).0
298    } else {
299        // FIXME: Move this to a function
300        let (pmm, _) = path_metadata_map_from_file_targets(
301            output_snd,
302            COMMON_IGNORE_PATTERNS,
303            xvc_root,
304            // This should be ok as we checked empty condition on has_globs_or_dirs
305            targets.clone().unwrap(),
306            &xvc_core::walker::WalkOptions::xvcignore(),
307        )?;
308        let mut xpmm = HashMap::new();
309
310        pmm.into_iter().for_each(|pm| {
311            let md: XvcMetadata = XvcMetadata::from(pm.metadata);
312            let rxp = XvcPath::new(xvc_root, xvc_root.absolute_path(), &pm.path);
313            match rxp {
314                Ok(xvc_path) => {
315                    xpmm.insert(xvc_path, md);
316                }
317                Err(e) => {
318                    e.warn();
319                }
320            }
321        });
322        xpmm
323    };
324
325    // Return false when the path is a git path
326
327    let git_files: HashSet<String> = if filter_git_paths {
328        let git_command_str = xvc_root.config().get_str("git.command")?.option;
329        let git_command = get_absolute_git_command(&git_command_str)?;
330        get_git_tracked_files(
331            &git_command,
332            xvc_root
333                .absolute_path()
334                .to_str()
335                .expect("xvc_root must have a path"),
336        )?
337        .into_iter()
338        .collect()
339    } else {
340        HashSet::new()
341    };
342
343    let mut git_path_filter: Box<dyn FnMut(&XvcPath) -> bool> = if filter_git_paths {
344        Box::new(|p: &XvcPath| {
345            let path_str = p.as_str();
346            let path_str = path_str
347                .strip_prefix(
348                    xvc_root
349                        .absolute_path()
350                        .to_str()
351                        .expect("xvc_root must have a path"),
352                )
353                .unwrap_or(path_str);
354            !git_files.contains(path_str)
355        })
356    } else {
357        Box::new(|_p: &XvcPath| true)
358    };
359
360    if let Some(targets) = targets {
361        // FIXME: Is this a bug? When targets is empty, we can return all files.
362        // Targets should be None to return all paths but what about we pass Some([])?
363
364        if targets.is_empty() {
365            return Ok(XvcPathMetadataMap::new());
366        }
367
368        let mut glob_matcher = build_glob_matcher(output_snd, xvc_root, targets)?;
369        Ok(all_paths
370            .into_iter()
371            .filter(|(p, _)| git_path_filter(p))
372            .filter(|(p, _)| glob_matcher.is_match(p.as_str()))
373            .collect())
374    } else {
375        Ok(all_paths
376            .into_iter()
377            .filter(|(p, _)| git_path_filter(p))
378            .collect())
379    }
380}
381
382/// Selects only the files in `targets` by matching them with the metadata in `xvc_metadata_store`.
383pub fn only_file_targets(
384    xvc_metadata_store: &XvcStore<XvcMetadata>,
385    targets: &HStore<XvcPath>,
386) -> Result<HStore<XvcPath>> {
387    let target_metadata = xvc_metadata_store.subset(targets.keys().copied())?;
388
389    assert! {
390        target_metadata.len() == targets.len(),
391        "The number of targets and the number of target metadata should be the same."
392    }
393
394    let target_files = targets.subset(
395        target_metadata
396            .filter(|_, xmd| xmd.file_type == XvcFileType::File)
397            .keys()
398            .copied(),
399    )?;
400
401    Ok(target_files)
402}
403
404/// Return the metadata of targets. This is used in various functions to get the
405/// changed files in repository. When you want to get all files and their
406/// metadata, it may be better to use [all_paths_and_metadata].
407pub fn xvc_path_metadata_map_from_disk(
408    xvc_root: &XvcRoot,
409    targets: &HStore<XvcPath>,
410) -> XvcPathMetadataMap {
411    targets
412        .par_iter()
413        .map(|(_, xp)| {
414            let p = xp.to_absolute_path(xvc_root);
415            let xmd = XvcMetadata::from(p.metadata());
416            (xp.clone(), xmd)
417        })
418        .collect()
419}
420
421/// Copies / links `cache_path` to `xvc_path` with `recheck_method`.
422/// WARNING: If `xvc_path` is already present, it will be deleted first.
423/// It also sends an ignore operation to `ignore_writer`.
424pub fn recheck_from_cache(
425    output_snd: &XvcOutputSender,
426    xvc_root: &XvcRoot,
427    xvc_path: &XvcPath,
428    cache_path: &XvcCachePath,
429    recheck_method: RecheckMethod,
430    ignore_writer: &Sender<IgnoreOp>,
431) -> Result<()> {
432    if let Some(parent) = xvc_path.parents().first() {
433        let parent_dir = parent.to_absolute_path(xvc_root);
434        if !parent_dir.exists() {
435            fs::create_dir_all(parent_dir)?;
436            uwr!(
437                ignore_writer.send(Some(IgnoreOperation::IgnoreDir {
438                    dir: parent.clone(),
439                })),
440                output_snd
441            );
442        }
443    }
444    let cache_path = cache_path.to_absolute_path(xvc_root);
445    let path = xvc_path.to_absolute_path(xvc_root);
446    // If the file already exists, we delete it.
447    if path.exists() {
448        fs::remove_file(&path)?;
449    }
450
451    match recheck_method {
452        RecheckMethod::Copy => {
453            copy_file(output_snd, cache_path, path)?;
454        }
455        RecheckMethod::Hardlink => {
456            fs::hard_link(&cache_path, &path)?;
457            info!(output_snd, "[HARDLINK] {} -> {}", cache_path, path);
458        }
459        RecheckMethod::Symlink => {
460            make_symlink(&cache_path, &path)?;
461            info!(output_snd, "[SYMLINK] {} -> {}", cache_path, path);
462        }
463        RecheckMethod::Reflink => {
464            reflink(output_snd, cache_path, path)?;
465        }
466    }
467    uwr!(
468        ignore_writer.send(Some(IgnoreOperation::IgnoreFile {
469            file: xvc_path.clone(),
470        })),
471        output_snd
472    );
473    Ok(())
474}
475
/// Creates a reflink from `cache_path` to `path`.
///
/// If the reflink cannot be created, a warning is sent to `output_snd` and the
/// file is copied with [copy_file] instead.
#[cfg(feature = "reflink")]
fn reflink(
    output_snd: &XvcOutputSender,
    cache_path: AbsolutePath,
    path: AbsolutePath,
) -> Result<()> {
    if let Err(e) = reflink::reflink(&cache_path, &path) {
        warn!(
            output_snd,
            "File system doesn't support reflink. {e}. Copying instead."
        );
        return copy_file(output_snd, cache_path, path);
    }

    info!(output_snd, "[REFLINK] {} -> {}", cache_path, path);
    Ok(())
}
496
497fn copy_file(
498    output_snd: &XvcOutputSender,
499    cache_path: AbsolutePath,
500    path: AbsolutePath,
501) -> Result<()> {
502    fs::copy(&cache_path, &path)?;
503    set_writable(&path)?;
504    info!(output_snd, "[COPY] {} -> {}", cache_path, path);
505    Ok(())
506}
507
/// Set a path to writable on non-unix systems by clearing its read-only flag.
#[cfg(not(unix))]
pub fn set_writable(path: &Path) -> Result<()> {
    let mut permissions = fs::metadata(path)?.permissions();
    permissions.set_readonly(false);
    fs::set_permissions(path, permissions)?;
    Ok(())
}
515
/// Set a path to readonly on non-unix systems by setting its read-only flag.
#[cfg(not(unix))]
pub fn set_readonly(path: &Path) -> Result<()> {
    let mut permissions = fs::metadata(path)?.permissions();
    permissions.set_readonly(true);
    fs::set_permissions(path, permissions)?;
    Ok(())
}
523
/// Set a path to user writable on unix systems.
///
/// Adds the owner write bit (`0o200`) to the current mode of `path` and leaves
/// all other permission bits as they are.
#[cfg(unix)]
pub fn set_writable(path: &Path) -> Result<()> {
    let mut permissions = fs::metadata(path)?.permissions();
    let user_writable_mode = permissions.mode() | 0o200;
    permissions.set_mode(user_writable_mode);
    fs::set_permissions(path, permissions)?;
    Ok(())
}
534
/// Set a path to readonly on unix systems.
///
/// Removes the owner write bit (`0o200`) from the current mode of `path` and
/// leaves all other permission bits as they are.
#[cfg(unix)]
pub fn set_readonly(path: &Path) -> Result<()> {
    let mut permissions = fs::metadata(path)?.permissions();
    let user_readonly_mode = permissions.mode() & !0o200;
    permissions.set_mode(user_readonly_mode);
    fs::set_permissions(path, permissions)?;
    Ok(())
}
545
546#[cfg(not(feature = "reflink"))]
547fn reflink(
548    output_snd: &XvcOutputSender,
549    cache_path: AbsolutePath,
550    path: AbsolutePath,
551) -> Result<()> {
552    warn!(
553        output_snd,
554        "Xvc isn't compiled with reflink support. Copying the file."
555    );
556    copy_file(output_snd, cache_path, path)
557}
558
559/// All cache paths for all xvc paths.
560/// There are extracted from the event logs.
561pub fn cache_paths_for_xvc_paths(
562    output_snd: &XvcOutputSender,
563    all_paths: &XvcStore<XvcPath>,
564    all_content_digests: &XvcStore<ContentDigest>,
565) -> Result<HStore<Vec<XvcCachePath>>> {
566    // Get cache paths for each
567
568    let mut all_cache_paths: HStore<Vec<XvcCachePath>> = HStore::new();
569
570    // Find all cache paths
571    // We have 1-1 relationship between content digests and paths.
572    // So, in order to get earlier versions, we check the event log.
573    for (xe, xp) in all_paths.iter() {
574        let path_digest_events: EventLog<ContentDigest> =
575            all_content_digests.all_event_log_for_entity(*xe)?;
576        let cache_paths = path_digest_events
577            .iter()
578            .filter_map(|cd_event| match cd_event {
579                xvc_core::Event::Add { entity: _, value } => {
580                    let xcp = uwr!(XvcCachePath::new(xp, value), output_snd
581                 );
582
583                    Some(xcp)
584                }
585                xvc_core::Event::Remove { entity } => {
586                    // We don't delete ContentDigests of available XvcPaths.
587                    // This is an error.
588                    error!(
589                    output_snd,
590                    "There shouldn't be a remove event for content digest of {xp}. Please report this. {}",
591                    entity
592                );
593                    None
594                }
595            })
596            .collect();
597        all_cache_paths.insert(*xe, cache_paths);
598    }
599
600    Ok(all_cache_paths)
601}
602
603/// Moves the `path` to `cache_path`.
604///
605/// It creates the cache directory and sets the cache file read only.
606///
607/// It overwrites the cache file if it already exists.
608///
609/// The [PathSync] struct is used to lock the paths during the operation, so that no two threads
610/// try to accessl to the same path at the same time.
611// TODO: Remove this when we set unix permissions in platform dependent fashion
612#[allow(clippy::permissions_set_readonly_false)]
613pub fn move_to_cache(
614    path: &AbsolutePath,
615    cache_path: &AbsolutePath,
616    path_sync: &PathSync,
617) -> Result<()> {
618    let cache_dir = cache_path.parent().ok_or(Error::InternalError {
619        message: "Cache path has no parent.".to_string(),
620    })?;
621    // We don't lock the path_sync here because we don't want to block other threads.
622    path_sync
623        .with_sync_abs_path(path, |path| {
624            path_sync.with_sync_abs_path(cache_path, |cache_path| {
625                if !cache_dir.exists() {
626                    fs::create_dir_all(cache_dir)?;
627                }
628                // Set to writable
629                let mut dir_perm = cache_dir.metadata()?.permissions();
630                dir_perm.set_readonly(false);
631                fs::set_permissions(cache_dir, dir_perm)?;
632
633                fs::rename(path, cache_path)
634                    .map_err(|source| XvcWalkerError::IoError { source })?;
635                let mut file_perm = cache_path.metadata()?.permissions();
636                file_perm.set_readonly(true);
637                fs::set_permissions(cache_path, file_perm.clone())?;
638                let mut dir_perm = cache_dir.metadata()?.permissions();
639                dir_perm.set_readonly(true);
640                fs::set_permissions(cache_dir, dir_perm)?;
641                Ok(())
642            })
643        })
644        .map_err(|e| e.into())
645}
646
647/// Move an xvc_path to the cache path.
648/// Uses [move_to_cache]
649pub fn move_xvc_path_to_cache(
650    xvc_root: &XvcRoot,
651    xvc_path: &XvcPath,
652    cache_path: &XvcCachePath,
653    path_sync: &PathSync,
654) -> Result<()> {
655    let path = xvc_path.to_absolute_path(xvc_root);
656    let cache_path = cache_path.to_absolute_path(xvc_root);
657    move_to_cache(&path, &cache_path, path_sync)
658}
659
660/// Record store records checking their Diff.
661/// It loads the store and creates a new store by [apply_diff], then saves it.
662/// TODO: This may be optimized for in place update when stores get larger.
663pub fn update_store_records<T>(
664    xvc_root: &XvcRoot,
665    diffs: &DiffStore<T>,
666    add_new: bool,
667    remove_missing: bool,
668) -> Result<()>
669where
670    T: Storable,
671{
672    let records = xvc_root.load_store::<T>()?;
673    let new_store = apply_diff(&records, diffs, add_new, remove_missing)?;
674    xvc_root.save_store(&new_store)?;
675    Ok(())
676}