refman/
project.rs

1use std::{
2    collections::HashMap,
3    env::{self, current_dir},
4    fs::{self, File, read_to_string},
5    path::{Path, PathBuf},
6    str::FromStr,
7    sync::Arc,
8};
9
10use color_eyre::eyre::{Error as ColorError, eyre};
11use futures::future::try_join_all;
12use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
13use jiff::Timestamp;
14use log::{debug, info, warn};
15use prettytable::{Table, row};
16use reqwest::Client;
17use serde::{Deserialize, Serialize};
18use tokio::task::JoinHandle;
19
20use crate::{
21    EntryError, RegistryError, ValidationError,
22    data::{DownloadStatus, RefDataset},
23    downloads::{check_url, request_dataset},
24    validate::UnvalidatedFile,
25};
26
/// A reference manager for all data associated with your bioinformatics project.
///
/// Projects are the top-level abstraction in refman, allowing you to register, track,
/// download, and manage reference files like FASTA, Genbank, GFA, GFF, GTF and BED files
/// for your bioinformatics work. A Project maintains a registry of datasets, where each dataset
/// has a unique label and can contain references to multiple file types.
///
/// The Project struct provides methods to:
/// - Initialize new reference management projects
/// - Register new datasets or update existing ones
/// - Download registered datasets from remote URLs
/// - Remove datasets from the registry
/// - Pretty print the current state of registered datasets
///
/// Projects can be either local (stored in ./refman.toml) or global (stored in ~/.refman/refman.toml).
/// The registry location can also be customized via the `REFMAN_HOME` environment variable.
///
/// Each dataset in a project is tracked with a unique label and can contain optional URLs pointing
/// to reference files in standard bioinformatics formats (FASTA, Genbank, GFA, GFF, GTF, BED).
/// The registry maintains metadata like when it was last modified and optional title/description fields.
///
/// # Examples
///
/// ```no_run
/// # use refman::project::Project;
/// // Create a new local project
/// let project = Project::new(
///     Some("My Assembly Project".to_string()),
///     Some("Reference data for genome assembly".to_string()),
///     false
/// );
/// ```
///
/// The Project struct integrates with other refman types like `RefDataset` for managing individual
/// reference datasets and `RegistryOptions` for configuring where and how the registry is stored.
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct Project {
    /// The registry backing this project: its title, description, global/local flag and the
    /// list of registered datasets.
    project: Registry,
}
66
/// A batch of per-file download outcomes: each entry is either a downloaded, not-yet-validated
/// file or the error that was produced for it.
// NOTE(review): the use site is outside this view; presumably this is what the joined download
// tasks resolve to. Confirm against `download_dataset`.
type MultiDownloadResults = Vec<Result<UnvalidatedFile, ColorError>>;
68
69impl Project {
70    /// Creates a new Project struct with optional title and description strings and
71    /// a boolean flag controlling if the project's registry file is placed in a
72    /// global location (`REFMAN_HOME`or ~/.refman) or locally (./refman.toml).
73    ///
74    /// A Project is the top-level struct for managing reference data in refman. It
75    /// maintains a registry of reference genomics datasets, where each dataset can
76    /// include references to multiple standard bioinformatics file formats like
77    /// FASTA, Genbank, GFA, GFF, GTF and BED files. The registry stores metadata
78    /// about each reference dataset including when it was last modified.
79    ///
80    /// The registry file location depends on the `global` parameter:
81    /// - If `global=false` (default), creates a local refman.toml in current directory
82    /// - If `global=true`, uses either `$REFMAN_HOME/.refman/refman.toml` or ~/.refman/refman.toml
83    ///
84    /// # Arguments
85    ///
86    /// * `title` - Optional title for the project
87    /// * `description` - Optional description of the project
88    /// * `global` - Whether to store the registry file globally or locally
89    ///
90    /// # Returns
91    ///
92    /// Returns a new Project instance initialized with the provided title, description
93    /// and global flag. The internal Registry is created with default values for
94    /// `last_modified` timestamp and an empty datasets vector.
95    fn new(title: Option<String>, description: Option<String>, global: bool) -> Self {
96        // fill in any user provided title, description, or global information on
97        // top of the information stored in a project by default
98        let registry = Registry {
99            title,
100            description,
101            global,
102            ..Registry::default()
103        };
104
105        Self { project: registry }
106    }
107
108    /// Returns a read-only slice of all reference datasets currently registered in the project.
109    ///
110    /// This method provides access to the raw collection of `RefDataset` entries stored in the
111    /// project's registry. Each `RefDataset` represents a labeled collection of bioinformatics
112    /// reference files, potentially including FASTA, Genbank, GFA, GFF, GTF and BED formats.
113    ///
114    /// This accessor is useful for:
115    /// - Inspecting the currently registered datasets without modifying them
116    /// - Iterating over registered datasets to check their properties
117    /// - Filtering datasets based on custom criteria
118    /// - Accessing individual dataset labels and file URLs
119    ///
120    /// The returned slice allows read-only access to ensure the registry's integrity is maintained.
121    /// For mutable access, use `datasets_mut()` instead. For taking ownership of the datasets,
122    /// use `datasets_owned()`.
123    ///
124    /// # Returns
125    ///
126    /// A read-only slice containing all registered `RefDataset` entries in the project.
127    /// Returns an empty slice if no datasets are registered.
128    #[inline]
129    #[must_use]
130    pub fn datasets(&self) -> &[RefDataset] {
131        self.project.datasets.as_slice()
132    }
133
134    /// Returns a mutable slice of all reference datasets registered in the project.
135    ///
136    /// This method provides mutable access to the raw collection of `RefDataset` entries stored in
137    /// the project's registry. Each `RefDataset` represents a labeled collection of bioinformatics
138    /// reference files, potentially including FASTA, Genbank, GFA, GFF, GTF and BED formats.
139    ///
140    /// Mutable access allows modifying existing datasets, for example to:
141    /// - Update file URLs for existing datasets
142    /// - Modify dataset labels or other metadata
143    /// - Add or remove file references from datasets
144    /// - Reorder datasets within the registry
145    ///
146    /// Use this method with caution as it allows direct mutation of the registry state.
147    /// For read-only access, use `datasets()` instead. To take ownership of the datasets,
148    /// use `datasets_owned()`.
149    ///
150    /// # Returns
151    ///
152    /// A mutable slice containing all registered `RefDataset` entries in the project.
153    /// Returns an empty slice if no datasets are registered.
154    #[inline]
155    pub fn datasets_mut(&mut self) -> &mut [RefDataset] {
156        self.project.datasets.as_mut_slice()
157    }
158
159    /// Takes ownership of all reference datasets registered in the project.
160    ///
161    /// This method provides a way to take ownership of the raw collection of `RefDataset` entries
162    /// stored in the project's registry, consuming the project in the process. Each `RefDataset`
163    /// represents a labeled collection of bioinformatics reference files, potentially including
164    /// FASTA, Genbank, GFA, GFF, GTF and BED formats.
165    ///
166    /// Taking ownership via `datasets_owned()` allows:
167    /// - Moving datasets out of the Project context entirely
168    /// - Transferring datasets between Projects
169    /// - Performing owned operations on datasets that require ownership
170    /// - Converting datasets into other data structures
171    ///
172    /// This is different from `datasets()` which provides read-only access an`datasets_mut()`
173    /// which provides mutable access but keeps ownership within the Project. Using `datasets_owned()`
174    /// consumes the Project instance.
175    ///
176    /// # Returns
177    ///
178    /// A Vec containing all registered `RefDataset` entries, transferring ownership from
179    /// the Project to the caller. Returns an empty Vec if no datasets were registered.
180    /// The Project instance is consumed in the process.
181    #[inline]
182    #[must_use]
183    pub fn datasets_owned(self) -> Vec<RefDataset> {
184        self.project.datasets
185    }
186
187    /// Returns a reference to a specific dataset from the Project's registry by its label.
188    ///
189    /// This method provides direct access to individual reference datasets stored in the project's
190    /// registry. It takes a label string and returns a reference to the matching `RefDataset` if one
191    /// exists. Each dataset in a refman Project has a unique label that identifies it, containing
192    /// optional references to various bioinformatics file formats (FASTA, Genbank, GFA, GFF, GTF, BED).
193    ///
194    /// The method enforces that:
195    /// - The label must exactly match a registered dataset (case-sensitive)
196    /// - Only one dataset can have a given label (unique key constraint)
197    /// - The dataset must exist in the registry
198    ///
199    /// This is commonly used to:
200    /// - Check details of specific registered datasets
201    /// - Access dataset file URLs before downloading
202    /// - Verify dataset registration status
203    /// - Extract dataset metadata
204    ///
205    /// The method complements other Project methods like `register()` an`download_dataset()`() in the
206    /// dataset management lifecycle. While those methods add and fetch datasets, `get_dataset()`
207    /// provides read access to verify and inspect registered data.
208    ///
209    /// # Arguments
210    ///
211    /// * `label` - The unique label identifying the dataset to retrieve
212    ///
213    /// # Returns
214    ///
215    /// Returns Ok(&RefDataset) with a reference to the matching dataset if found.
216    /// Returns `EntryError::LabelNotFound` if no dataset matches the provided label.
217    ///
218    /// # Errors
219    ///
220    /// Can return `EntryError::LabelNotFound` if the requested dataset label is not
221    /// registered in the project.
222    ///
223    /// # Panics
224    ///
225    /// This method will panic if:
226    /// - More than one dataset with the same label exists in the registry
227    ///   (indicates invalid state as labels must be unique)
228    /// - The filtered dataset collection contains an unexpected number of matches
229    ///   (should be exactly 1 match for a valid label)
230    #[inline]
231    pub fn get_dataset(&self, label: &str) -> Result<&RefDataset, EntryError> {
232        // pull in a read-only slice of the datasets currently in project state
233        let datasets = self.datasets();
234
235        // If a dataset isn't in the current project state, return a refman error
236        // wrapped in an anyhow error.
237        if datasets
238            .iter()
239            .map(|dataset| dataset.label.as_str())
240            .filter(|ds_label| *ds_label == label)
241            .collect::<Vec<&str>>()
242            .is_empty()
243        {
244            Err(EntryError::LabelNotFound(label.to_string()))?;
245        }
246
247        // make sure only one dataset matches the provided label, which must be a unique
248        // key
249        let entry: Vec<_> = datasets
250            .iter()
251            .filter(|dataset| dataset.label == label)
252            .collect();
253        assert_eq!(entry.len(), 1);
254
255        Ok(entry[0])
256    }
257
258    /// Returns a vector of all registered file URLs for a dataset with the given label.
259    ///
260    /// This method provides access to all file URLs registered for a dataset, combining any valid URLs
261    /// across the supported bioinformatics file formats (FASTA, Genbank, GFA, GFF, GTF, BED). The URLs
262    /// can then be used to download reference files, validate dataset completeness, or inspect available
263    /// file formats.
264    ///
265    /// The method will:
266    /// - Verify the dataset exists by the given label
267    /// - Extract all non-None URLs registered for that dataset
268    /// - Return them as a vector in a consistent order (FASTA, Genbank, etc.)
269    ///
270    /// This complements other dataset access methods by providing URL-specific functionality. While
271    /// `get_dataset()` returns the full dataset struct an`download_dataset()` handles file fetching,
272    /// `get_dataset_urls()` focuses specifically on URL access and validation.
273    ///
274    /// The method is used internally by `download_dataset()` to determine which files to fetch, but can
275    /// also be used directly to:
276    /// - Preview what files are available before downloading
277    /// - Extract URLs for custom download handling
278    /// - Verify dataset completeness
279    /// - Share dataset URLs
280    ///
281    /// # Arguments
282    ///
283    /// * `label` - The unique label identifying the dataset whose URLs should be retrieved
284    ///
285    /// # Returns
286    ///
287    /// Returns Ok(Vec<String>) containing all non-None URLs registered for the dataset.
288    /// Returns an empty vector if the dataset exists but has no URLs registered.
289    /// Returns `EntryError::LabelNotFound` if no dataset matches the provided label.
290    ///
291    /// # Errors
292    ///
293    /// Can return `EntryError::LabelNotFound` if the requested dataset label is not in the registry.
294    #[inline]
295    pub fn get_dataset_urls(&self, label: &str) -> Result<Vec<String>, EntryError> {
296        // access the dataset for the provided label
297        let dataset = self.get_dataset(label)?;
298
299        // build a vector based on the URLs that may or may not be available for downloading
300        let urls = vec![
301            dataset.fasta.clone(),
302            dataset.genbank.clone(),
303            dataset.gfa.clone(),
304            dataset.gff.clone(),
305            dataset.gtf.clone(),
306            dataset.bed.clone(),
307        ]
308        .into_iter()
309        .flatten()
310        .map(|download| download.url_owned())
311        .collect::<Vec<String>>();
312
313        Ok(urls)
314    }
315
316    /// Returns a vector of URLs for all reference data across all registered datasets.
317    ///
318    /// This method provides access to all file URLs registered in the project's datasets,
319    /// aggregating URLs from each dataset and each supported bioinformatics file format
320    /// (FASTA, Genbank, GFA, GFF, GTF, BED). It is useful for:
321    /// - Getting an overview of all reference data in the project
322    /// - Batch downloading all registered files
323    /// - Validating URLs across the entire registry
324    /// - Sharing/exporting full URL lists
325    ///
326    /// The method processes each dataset sequentially, collecting any non-None URLs into
327    /// a single vector. URLs are gathered in a consistent order per dataset:
328    /// FASTA -> Genbank -> GFA -> GFF -> GTF -> BED.
329    ///
330    /// Unlike `get_dataset_urls()` which operates on a single labeled dataset, this method
331    /// provides complete URL access across the entire registry. It complements other Project
332    /// methods like `download_dataset()` by enabling bulk operations across all reference data.
333    ///
334    /// The method enforces URL validity by checking that:
335    /// - No empty URLs are included
336    /// - All URLs use either http:// or https:// protocols
337    ///
338    /// # Returns
339    ///
340    /// Returns Ok(Vec<String>) containing all valid URLs across all datasets.
341    /// Returns an empty vector if no URLs are registered.
342    ///
343    /// # Errors
344    ///
345    /// Can return `EntryError` variants if:
346    /// - Dataset access fails
347    /// - URL validation fails
348    /// - Project state is invalid
349    ///
350    /// # Panics
351    ///
352    /// This method will panic if:
353    /// - Empty URLs are found in datasets (invalid state)
354    /// - URLs with invalid protocols are found (must be http/https)
355    #[inline]
356    pub fn get_all_urls(&self) -> Result<Vec<String>, EntryError> {
357        // access the dataset for the provided label
358        let datasets = self.datasets();
359
360        // build a vector based on the URLs that may or may not be available for downloading
361        let mut all_urls = Vec::new();
362        for dataset in datasets {
363            let urls = vec![
364                dataset.fasta.clone(),
365                dataset.genbank.clone(),
366                dataset.gfa.clone(),
367                dataset.gff.clone(),
368                dataset.gtf.clone(),
369                dataset.bed.clone(),
370            ]
371            .into_iter()
372            .flatten()
373            .map(|download| download.url_owned())
374            .collect::<Vec<String>>();
375            all_urls.extend(urls);
376        }
377        assert!(
378            all_urls.iter().all(|url| !url.is_empty()),
379            "Found empty URLs in dataset"
380        );
381        assert!(
382            all_urls
383                .iter()
384                .all(|url| url.starts_with("http://") || url.starts_with("https://")),
385            "Found invalid URL protocols"
386        );
387
388        Ok(all_urls)
389    }
390
391    /// Checks if a dataset with a given label is registered in the project.
392    ///
393    /// This method searches through the project's registry to determine if a dataset
394    /// with the specified label exists. Each dataset in a refman Project must have a
395    /// unique label that identifies it - this label acts as the primary key for the
396    /// dataset within the registry.
397    ///
398    /// This method is useful for:
399    /// - Validating labels before attempting to register or update datasets
400    /// - Checking existence of specific datasets before trying to download them
401    /// - General queries about what data is available in the project
402    ///
403    /// The check is case-sensitive - "genome" and "Genome" are considered different labels.
404    /// Labels must be unique within a project's registry.
405    ///
406    /// # Arguments
407    ///
408    /// * `label` - The label string to search for in the registry
409    ///
410    /// # Returns
411    ///
412    /// Returns `true` if a dataset with the given label exists in the registry,
413    /// `false` otherwise. Note that this only checks for label existence, not whether
414    /// the dataset has any file URLs registered or if those files are accessible.
415    #[must_use]
416    pub fn is_registered(&self, label: &str) -> bool {
417        // Iterate through a slice of the available datasets, keeping only the dataset
418        // with a label matching what the user has requested. Return true if the result
419        // is not empty and false if it is.
420        !self
421            .datasets()
422            .iter()
423            .filter(|dataset| dataset.label == label)
424            .collect::<Vec<&RefDataset>>()
425            .is_empty()
426    }
427
428    /// Registers a new dataset or updates an existing dataset in the Project's registry.
429    ///
430    /// This is one of the core methods for managing reference data in refman. It takes a `RefDataset`
431    /// struct containing a unique label and optional URLs for various bioinformatics file formats
432    /// (FASTA, Genbank, GFA, GFF, GTF, BED, TAR) and either:
433    ///
434    /// - Adds it as a new dataset if the label doesn't exist in the registry yet
435    /// - Updates an existing dataset with any new URLs provided if the label matches
436    ///
437    /// When updating an existing dataset, only fields that are Some(url) in the new `RefDataset`
438    /// will overwrite the existing dataset's fields. This allows for incremental updates where
439    /// you can add new file references to a dataset over time without having to re-specify
440    /// existing URLs.
441    ///
442    /// The registry enforces that dataset labels must be unique - you cannot have two datasets
443    /// with the same label. This allows the label to act as a primary key for looking up and
444    /// managing datasets within the project.
445    ///
446    /// # Arguments
447    ///
448    /// * `new_dataset` - A `RefDataset` struct containing the label and optional file URLs to
449    ///   register or update. The label field is required and must be unique within the registry.
450    ///
451    /// # Returns
452    ///
453    /// Returns Ok(Project) with the updated Project if registration succeeds, or an `EntryError`
454    /// if there are issues with the dataset registration (e.g. invalid state detected).
455    ///
456    /// # Examples
457    ///
458    /// To register a new dataset:
459    /// ```rust,no_run
460    /// # use refman::{project::Project, data::RefDataset};
461    /// let mut project = Project::new(None, None, false);
462    /// let dataset = RefDataset {
463    ///     label: "genome".into(),
464    ///     fasta: Some("https://example.com/genome.fasta".into()),
465    ///     ..Default::default()
466    /// };
467    /// project = project.register(dataset).unwrap();
468    /// ```
469    ///
470    /// The registration process will either add this as a new dataset if "genome" is not yet
471    /// registered, or update the existing "genome" dataset with the new FASTA URL if it exists.
472    ///
473    /// # Errors
474    ///
475    /// This method can return several types of errors:
476    /// - `EntryError::LabelNotFound` if the dataset being registered cannot be found during updates
477    /// - `EntryError::FinalEntry` if registering this dataset would leave the registry empty
478    /// - Filesystem errors from reading/writing the registry file
479    /// - Serialization errors when encoding/decoding the registry TOML
480    /// - Permission errors when accessing registry files
481    /// - IO errors if registry files or directories cannot be accessed
482    /// - Environment variable errors if `REFMAN_HOME` is invalid
483    /// - Path resolution errors for invalid registry paths
484    ///
485    /// # Panics
486    ///
487    /// This method will panic if multiple datasets matching the given label are found in
488    /// the registry. This should never happen as labels must be unique, but represents an
489    /// invalid state that requires immediate attention.
490    ///
491    pub async fn register(mut self, new_dataset: RefDataset) -> Result<Self, EntryError> {
492        let Some(dataset_match_idx) = self.get_dataset_idx(&new_dataset.label) else {
493            // if the label wasn't found, it's not in the registry, so it can be safely
494            // appended without any fear of duplication
495            self.project.datasets.push(new_dataset);
496            return Ok(self);
497        };
498
499        // pull in a mutable reference to the slice of datasets, get a mutable reference to the one
500        // dataset we need to update (using the index), and then update each of it's fields if the
501        // user provided values for them.
502        let previous_datasets = self.datasets_mut();
503        let dataset_to_update = &mut previous_datasets[dataset_match_idx];
504
505        // use pattern matching here to get exhaustiveness checking instead of if-else
506        match new_dataset {
507            // if it's a FASTA, make sure the link points to a resource that exists and then update
508            // the registry with it
509            RefDataset {
510                fasta: Some(ref fasta),
511                ..
512            } => {
513                // TODO: All checkes here, including whether a URI is likely a URL, whether that URL is valid, and
514                // whether a local path exists, could be included in a guard, relegating the single unhappy path
515                // where a provided URI is not a URL that exists and is not a path that exists, to a single branch arm
516                // for all file types.
517                let url_str = fasta.url();
518                if is_likely_url(url_str) {
519                    let _ = check_url(url_str).await?;
520                } else if !PathBuf::from(url_str).is_file() {
521                    return Err(EntryError::InvalidURL(eyre!(
522                        "The provided uri {url_str} was not a web link, nor was it a local file path pointing to something that exists."
523                    )));
524                }
525                dataset_to_update.fasta = new_dataset.fasta;
526            },
527
528            // Do the same thing but with a putative genbank file
529            RefDataset {
530                genbank: Some(ref genbank),
531                ..
532            } => {
533                let url_str = genbank.url();
534                if is_likely_url(url_str) {
535                    let _ = check_url(url_str).await?;
536                }
537                dataset_to_update.genbank = new_dataset.genbank;
538            },
539
540            // Do the same thing but with a putative GFA file
541            RefDataset {
542                gfa: Some(ref gfa), ..
543            } => {
544                let url_str = gfa.url();
545                if is_likely_url(url_str) {
546                    let _ = check_url(url_str).await?;
547                }
548                dataset_to_update.gfa = new_dataset.gfa;
549            },
550
551            // Do the same thing but with a putative GFF file
552            RefDataset {
553                gff: Some(ref gff), ..
554            } => {
555                let url_str = gff.url();
556                if is_likely_url(url_str) {
557                    let _ = check_url(url_str).await?;
558                }
559                dataset_to_update.gff = new_dataset.gff;
560            },
561
562            // Do the same thing but with a putative GTF file
563            RefDataset {
564                gtf: Some(ref gtf), ..
565            } => {
566                let url_str = gtf.url();
567                if is_likely_url(url_str) {
568                    let _ = check_url(url_str).await?;
569                }
570                dataset_to_update.gtf = new_dataset.gtf;
571            },
572
573            // Do the same thing but with a putative BED file
574            RefDataset {
575                bed: Some(ref bed), ..
576            } => {
577                let url_str = bed.url();
578                if is_likely_url(url_str) {
579                    let _ = check_url(url_str).await?;
580                }
581                dataset_to_update.bed = new_dataset.bed;
582            },
583
584            // Do the same thing but with a putative TAR file
585            RefDataset {
586                tar: Some(ref tar), ..
587            } => {
588                let url_str = tar.url();
589                if is_likely_url(url_str) {
590                    let _ = check_url(url_str).await?;
591                }
592                dataset_to_update.tar = new_dataset.tar;
593            },
594
595            // If somehow this state has slipped through the cracks, it means there's no file to
596            // update the registry with, which is a `LabelButNoFiles` error
597            RefDataset {
598                label: _,
599                fasta: None,
600                genbank: None,
601                gfa: None,
602                gff: None,
603                gtf: None,
604                bed: None,
605                tar: None,
606            } => return Err(EntryError::LabelButNoFiles),
607        }
608
609        // If we've made it this far, all is well; return the mutated instance of
610        // the project.
611        Ok(self)
612    }
613
614    #[inline]
615    fn get_dataset_idx(&self, label: &str) -> Option<usize> {
616        // find the index of the old dataset to be updated with new information from
617        // the user
618        let dataset_match_indices: Vec<_> = self
619            .datasets()
620            .iter()
621            .enumerate()
622            .filter(|(_i, dataset)| dataset.label == label)
623            .map(|(i, _)| i)
624            .collect();
625
626        if dataset_match_indices.is_empty() {
627            return None;
628        }
629
630        // Make sure that the above system that we *assume* will work doesn't actually break (it should never
631        // be possible to have two dataset entries with the same label).
632        assert_eq!(
633            dataset_match_indices.len(),
634            1,
635            "Invalid state slipped through the cracks when identifying which dataset should be updated with the new information for dataset '{}'. Somehow, multiple indices were returned: {:?}",
636            label,
637            &dataset_match_indices
638        );
639
640        // With that assert passing, pull out the index usize
641        Some(dataset_match_indices[0])
642    }
643
644    #[allow(clippy::similar_names)]
645    pub(crate) fn collect_downloads(
646        &self,
647        label: Option<&str>,
648        target_dir: &Path,
649    ) -> Vec<(RefDataset, Vec<UnvalidatedFile>)> {
650        let datasets = if let Some(label) = label {
651            self.clone()
652                .datasets_owned()
653                .into_iter()
654                .filter(|dataset| dataset.label == label)
655                .collect::<Vec<_>>()
656        } else {
657            self.clone()
658                .datasets_owned()
659                .into_iter()
660                .collect::<Vec<_>>()
661        };
662        assert_ne!(0, datasets.len());
663        datasets
664            .into_iter()
665            .map(|dataset| {
666                let fasta = dataset.get_fasta_download(target_dir);
667                let genbank = dataset.get_genbank_download(target_dir);
668                let gfa = dataset.get_gfa_download(target_dir);
669                let gtf = dataset.get_gtf_download(target_dir);
670                let gff = dataset.get_gff_download(target_dir);
671                let bed = dataset.get_bed_download(target_dir);
672                let tar = dataset.get_tar_download(target_dir);
673                info!(
674                    "Preparing to download these files:\n{:?}",
675                    [&fasta, &genbank, &gfa, &gff, &gtf, &bed, &tar]
676                );
677                let files = [fasta, genbank, gfa, gff, gtf, bed, tar]
678                    .into_iter()
679                    .flatten()
680                    .collect::<Vec<_>>();
681                (dataset, files)
682            })
683            .collect::<Vec<_>>()
684    }
685
686    /// Downloads a reference dataset from a Project's registry by label, fetching any registered file
687    /// URLs into a target directory.
688    ///
689    /// This method implements the core file downloading functionality in refman. Given a dataset label
690    /// and target directory, it will:
691    /// 1. Verify the dataset exists in the registry
692    /// 2. Extract all registered file URLs for that dataset (FASTA, Genbank, GFA, GFF, GTF, BED)
693    /// 3. Launch concurrent downloads of all files into the target directory
694    /// 4. Handle any download failures or errors
695    ///
696    /// Downloads happen asynchronously and in parallel for improved performance. The method uses
697    /// tokio for async runtime and reqwest for HTTP requests. Files are downloaded maintaining
698    /// their original filenames from the URLs.
699    ///
700    /// Dataset labels must exactly match what is registered (case-sensitive). The target directory
701    /// will be created if it doesn't exist. Existing files in the target directory may be
702    /// overwritten.
703    ///
704    /// This is used to fetch reference data after registering datasets with `register()`.
705    /// For example, after registering genome data with FASTA and GFF URLs, this method would
706    /// concurrently download both files locally.
707    ///
708    /// # Arguments
709    ///
710    /// * `label` - The unique label of the dataset to download, must match what was registered
711    /// * `target_dir` - Directory path where downloaded files should be saved
712    ///
713    /// # Returns
714    ///
715    /// Returns Ok(()) if all downloads complete successfully, or an error if:
716    /// - The dataset label is not found in the registry
717    /// - Any file downloads fail
718    /// - The target directory cannot be accessed/created
719    /// - Other IO or HTTP errors occur
720    ///
721    /// # Errors
722    ///
723    /// This method can return `EntryError::LabelNotFound` if the dataset is not in the registry,
724    /// as well as various IO and HTTP errors wrapped in `anyhow::Error` for failed downloads.
725    ///
726    /// # Panics
727    ///
728    /// This method will panic if:
729    /// - The progress bar style template is invalid
730    /// - Multiple instances simultaneously write to the same shared progress output
731    /// - The download futures report an internal thread failure
732    ///
733    #[allow(clippy::too_many_lines)]
734    pub async fn download_dataset(
735        self,
736        label: Option<&str>,
737        target_dir: PathBuf,
738    ) -> color_eyre::Result<Self> {
739        // make a new reqwest http client that can be shared between threads
740        let shared_client = Client::new();
741
742        // pull in the sets of files to be downloaded
743        let dataset_files: Vec<(RefDataset, Vec<UnvalidatedFile>)> =
744            self.collect_downloads(label, &target_dir);
745
746        // count the downloads
747        let num_to_download = count_downloads(&dataset_files);
748
749        // early return if there's nothing to download
750        if num_to_download == 0 {
751            info!(
752                "All requested files were previously downloaded and still passed checksums, so no downloads will be performed."
753            );
754            return Ok(self);
755        }
756
757        // set up a progress bar based on the number
758        let (mut toplevel_pb, multiprog) = setup_progress_tracking(label, num_to_download);
759
760        // put each download into its own tokio thread, and collect its handle into a vector
761        // that can be polled downstream
762        let dataset_task_handles =
763            submit_download_requests(dataset_files, &shared_client, &target_dir, &multiprog);
764
765        let updated_datasets =
766            update_project_datasets(dataset_task_handles, &mut toplevel_pb).await?;
767
768        // Once all downloads finish, update and finish the overall progress bar.
769        toplevel_pb.finish_with_message(format!(
770            "Done! {num_to_download} files successfully downloaded to {target_dir:?}."
771        ));
772
773        // Update the project and return it
774        let updated_project = self.update_registry(&updated_datasets);
775
776        Ok(updated_project)
777    }
778
779    #[must_use]
780    pub fn update_registry(self, new_datasets: &[RefDataset]) -> Project {
781        // make a hashmap of the old datasets and new datasets we can compare for available updates
782        let old_datasets: HashMap<&str, &RefDataset> = self
783            .datasets()
784            .iter()
785            .map(|dataset| (dataset.label.as_str(), dataset))
786            .collect();
787        let updated_datasets: HashMap<&str, &RefDataset> = new_datasets
788            .iter()
789            .map(|dataset| (dataset.label.as_str(), dataset))
790            .collect();
791
792        // if a key in the old dataset is also in a new dataset, swap in the new data
793        let merged_datasets: Vec<RefDataset> = old_datasets
794            .into_iter()
795            .map(|(label, dataset)| match updated_datasets.get(label) {
796                Some(aha) => (*aha).to_owned(),
797                None => dataset.clone(),
798            })
799            .collect();
800
801        // use Rust's nice struct update syntax to create a new registry
802        let updated_registry = Registry {
803            datasets: merged_datasets,
804            last_modified: Timestamp::now(),
805            ..self.project
806        };
807
808        // return a new instance of the project in functional style
809        Self {
810            project: updated_registry,
811        }
812    }
813
814    /// Removes a dataset from the Project's registry by its label.
815    ///
816    /// This method allows removing individual datasets from a refman Project's registry
817    /// while maintaining the integrity of the remaining datasets. It can be used to:
818    /// - Remove outdated or no longer needed reference datasets
819    /// - Clean up the registry by removing temporary entries
820    /// - Manage the project's dataset collection over time
821    ///
822    /// The method enforces several rules to maintain registry integrity:
823    /// - The label must exactly match an existing dataset (case-sensitive)
824    /// - The registry must maintain at least one dataset after removal
825    /// - Only one dataset can be removed at a time
826    ///
827    /// This complements `register()` an`download_dataset()` in the lifecycle of managing
828    /// reference data. While those methods add and fetch datasets, `remove()` allows
829    /// pruning datasets that are no longer needed.
830    ///
831    /// # Arguments
832    ///
833    /// * `label` - The unique label identifying the dataset to remove from the registry
834    ///
835    /// # Returns
836    ///
837    /// Returns Ok(Project) with the updated Project if removal succeeds, or an
838    /// `EntryError` in the following cases:
839    /// - `EntryError::LabelNotFound` if no dataset matches the provided label
840    /// - `EntryError::FinalEntry` if removing this dataset would empty the registry
841    ///
842    /// The Project instance is consumed and a new instance is returned to maintain
843    /// the builder pattern used throughout the API.
844    ///
845    /// # Errors
846    ///
847    /// This method can return the following errors:
848    /// - `EntryError::LabelNotFound` if the specified label is not in the registry
849    /// - `EntryError::FinalEntry` if removing this dataset would empty the registry
850    ///   entirely (at least one dataset must always remain)
851    ///
852    pub fn remove(mut self, label: &str) -> Result<Self, EntryError> {
853        // make sure the label is in the recorded datasets
854        if self
855            .datasets()
856            .iter()
857            .filter(|dataset| dataset.label == label)
858            .collect::<Vec<&RefDataset>>()
859            .is_empty()
860        {
861            return Err(EntryError::LabelNotFound(label.to_string()));
862        }
863
864        // if it is, filter it out in place
865        self.project
866            .filter_datasets(|dataset| dataset.label != label);
867
868        // return an error if that was the last entry
869        if self.datasets().is_empty() {
870            return Err(EntryError::FinalEntry(label.to_string()));
871        }
872
873        // otherwise, return the mutated project
874        Ok(self)
875    }
876
877    fn print_single_label_data(self, label: &str) {
878        let datasets = self.datasets();
879        let dataset: Vec<_> = datasets
880            .iter()
881            .filter(|dataset| dataset.label == label)
882            .collect();
883        assert_eq!(
884            dataset.len(),
885            1,
886            "No project with the label '{label}' has been registered. Run `refman list` without the label to see which datasets are registered."
887        );
888        let unwrapped_dataset = dataset[0];
889
890        eprintln!("URLs registered for {label}:");
891        eprintln!("--------------------{}", "-".repeat(label.len()));
892        eprintln!(
893            " - FASTA: {}",
894            unwrapped_dataset
895                .fasta
896                .clone()
897                .unwrap_or(DownloadStatus::default())
898        );
899        eprintln!(
900            " - Genbank: {}",
901            unwrapped_dataset
902                .genbank
903                .clone()
904                .unwrap_or(DownloadStatus::default())
905        );
906        eprintln!(
907            " - GFA: {}",
908            unwrapped_dataset
909                .gfa
910                .clone()
911                .unwrap_or(DownloadStatus::default())
912        );
913        eprintln!(
914            " - GFF: {}",
915            unwrapped_dataset
916                .gff
917                .clone()
918                .unwrap_or(DownloadStatus::default())
919        );
920        eprintln!(
921            " - GTF: {}",
922            unwrapped_dataset
923                .gtf
924                .clone()
925                .unwrap_or(DownloadStatus::default())
926        );
927        eprintln!(
928            " - BED: {}",
929            unwrapped_dataset
930                .bed
931                .clone()
932                .unwrap_or(DownloadStatus::default())
933        );
934        eprintln!(
935            " - TAR: {}",
936            unwrapped_dataset
937                .tar
938                .clone()
939                .unwrap_or(DownloadStatus::default())
940        );
941    }
942
943    fn print_all_labels(self) {
944        // print a title field if it has been set
945        let title_field = &self.project.title;
946        if let Some(title) = title_field {
947            info!("Showing available data registered for {title}:");
948        }
949
950        // make a new mutable instance of a pretty table to be appended to
951        let mut pretty_table = Table::new();
952
953        // add the title row
954        pretty_table.add_row(row![
955            "Label", "FASTA", "Genbank", "GFA", "GFF", "GTF", "BED", "TAR",
956        ]);
957
958        // add rows for each dataset
959        let datasets = self.datasets();
960        for dataset in datasets {
961            pretty_table.add_row(row![
962                dataset.label,
963                abbreviate_str(
964                    dataset
965                        .fasta
966                        .clone()
967                        .unwrap_or(DownloadStatus::default())
968                        .url_owned(),
969                    20,
970                    8,
971                    25
972                ),
973                abbreviate_str(
974                    dataset
975                        .genbank
976                        .clone()
977                        .unwrap_or(DownloadStatus::default())
978                        .url_owned(),
979                    20,
980                    8,
981                    25
982                ),
983                abbreviate_str(
984                    dataset
985                        .gfa
986                        .clone()
987                        .unwrap_or(DownloadStatus::default())
988                        .url_owned(),
989                    20,
990                    8,
991                    25
992                ),
993                abbreviate_str(
994                    dataset
995                        .gff
996                        .clone()
997                        .unwrap_or(DownloadStatus::default())
998                        .url_owned(),
999                    20,
1000                    8,
1001                    25
1002                ),
1003                abbreviate_str(
1004                    dataset
1005                        .gtf
1006                        .clone()
1007                        .unwrap_or(DownloadStatus::default())
1008                        .url_owned(),
1009                    20,
1010                    8,
1011                    25
1012                ),
1013                abbreviate_str(
1014                    dataset
1015                        .bed
1016                        .clone()
1017                        .unwrap_or(DownloadStatus::default())
1018                        .url_owned(),
1019                    20,
1020                    8,
1021                    25
1022                ),
1023                abbreviate_str(
1024                    dataset
1025                        .tar
1026                        .clone()
1027                        .unwrap_or(DownloadStatus::default())
1028                        .url_owned(),
1029                    20,
1030                    8,
1031                    25
1032                ),
1033            ]);
1034        }
1035
1036        pretty_table.printstd();
1037    }
1038
1039    /// Pretty prints the currently registered datasets in a tabular format.
1040    ///
1041    /// This method provides a human-readable view of all reference datasets currently registered
1042    /// in the Project. It prints a formatted table showing each dataset's label and any
1043    /// registered file URLs for the supported bioinformatics formats (FASTA, Genbank, GFA,
1044    /// GFF, GTF, BED).
1045    ///
1046    /// The output is formatted as a table with columns for:
1047    /// - Dataset Label
1048    /// - FASTA URL (if registered)
1049    /// - Genbank URL (if registered)
1050    /// - GFA URL (if registered)
1051    /// - GFF URL (if registered)
1052    /// - GTF URL (if registered)
1053    /// - BED URL (if registered)
1054    ///
1055    /// Empty cells indicate that no URL is registered for that file format. If the Project
1056    /// has a title set, it will be displayed above the table.
1057    ///
1058    /// This provides an easy way to:
1059    /// - View all registered datasets at once
1060    /// - Check which file formats are available for each dataset
1061    /// - Verify dataset labels and URLs
1062    /// - Share the current state of your reference data registry
1063    ///
1064    /// The method consumes self as it follows the builder pattern used throughout the API.
1065    /// The actual printing is handled through the prettytable crate for consistent formatting.
1066    ///
1067    /// # Outputs
1068    ///
1069    /// Prints a formatted table to stdout. If the Project has a title, it is printed as a
1070    /// header above the table. Empty values in the table indicate no URL is registered for
1071    /// that format.
1072    ///
1073    /// # Notes
1074    ///
1075    /// The output is meant for human consumption and formatted for readability. For
1076    /// programmatic access to dataset information, use the `datasets()` or `datasets_owned()`
1077    /// methods instead.
1078    ///
1079    /// # Panics
1080    ///
1081    /// This method will panic if:
1082    /// - Multiple datasets with the same label exist in the registry when requesting a specific label
1083    /// - A requested dataset label does not exist when filtering registered datasets
1084    /// - The prettytable crate encounters an error when printing the output table
1085    pub fn prettyprint(self, label: Option<String>) {
1086        // if the user requested a label, just print the information for that label
1087        if let Some(label_str) = label {
1088            self.print_single_label_data(&label_str);
1089            return;
1090        }
1091
1092        // otherwise, print all datasets as a table
1093        self.print_all_labels();
1094    }
1095}
1096
/// Shortens `s` to `<head>...<tail>` when it is longer than `max_chars` characters.
///
/// All lengths are counted in `char`s, not bytes, so multi-byte text is never split in the
/// middle of a character.
///
/// # Arguments
///
/// * `s` - The text to abbreviate
/// * `max_chars` - Strings with at most this many characters are returned unchanged
/// * `head_chars` - Number of leading characters to keep
/// * `tail_chars` - Number of trailing characters to keep
///
/// # Returns
///
/// The original string when it is short enough, or when the requested head and tail would
/// together cover the whole string (abbreviating would then repeat characters and make the
/// result longer than the input). Otherwise the head and tail joined by `...`.
#[inline]
fn abbreviate_str(s: String, max_chars: usize, head_chars: usize, tail_chars: usize) -> String {
    let char_count = s.chars().count();

    // Nothing to do for short strings, nor when head and tail would touch or overlap.
    if char_count <= max_chars || head_chars.saturating_add(tail_chars) >= char_count {
        return s;
    }

    // Translate the character counts into byte offsets so the pieces can be sliced out
    // directly. Past this point `head_chars + tail_chars < char_count`, so the subtraction
    // below cannot underflow.
    let byte_offset_of = |char_idx: usize| {
        s.char_indices()
            .nth(char_idx)
            .map_or(s.len(), |(byte_idx, _)| byte_idx)
    };
    let head_end = byte_offset_of(head_chars);
    let tail_start = byte_offset_of(char_count - tail_chars);

    format!("{}...{}", &s[..head_end], &s[tail_start..])
}
1122
/// The serializable contents of a refman registry: project metadata plus the list of
/// registered datasets.
///
/// NOTE(review): the declaration order of the fields presumably determines the key order
/// of the serialized TOML; confirm before reordering.
#[derive(Debug, Serialize, Deserialize, Clone)]
struct Registry {
    /// Optional human-readable title of the project.
    title: Option<String>,
    /// Optional free-text description of the project.
    description: Option<String>,
    /// When the registry was last changed; refreshed by `update_registry` and
    /// `write_registry`.
    last_modified: Timestamp,
    /// `true` for a global registry, `false` for a project-local one.
    global: bool,
    /// The registered reference datasets.
    datasets: Vec<RefDataset>,
}
1131
1132impl Default for Registry {
1133    fn default() -> Self {
1134        Registry {
1135            title: None,
1136            description: None,
1137            last_modified: Timestamp::now(),
1138            global: false,
1139            datasets: vec![],
1140        }
1141    }
1142}
1143
1144impl Registry {
1145    fn filter_datasets<F>(&mut self, predicate: F)
1146    where
1147        F: FnMut(&RefDataset) -> bool,
1148    {
1149        self.datasets.retain(predicate);
1150    }
1151}
1152
1153/// A configuration struct for customizing how refman interacts with registry files in your filesystem.
1154///
1155/// `RegistryOptions` is the primary way to control where and how refman stores its data. It provides
1156/// methods to:
1157/// - Set custom registry file locations
1158/// - Configure global vs local registry behavior
1159/// - Initialize new registry files
1160/// - Read from and write to existing registries
1161/// - Set project metadata like titles and descriptions
1162///
1163/// The struct resolves registry paths according to the following priority:
1164/// 1. User-specified custom path via `requested_path`
1165/// 2. For global registries (`global = true`):
1166///    - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
1167///    - ~/.refman/refman.toml as default global location
1168/// 3. For local registries (`global = false`):
1169///    - ./refman.toml in current directory
1170///
1171/// This flexibility allows refman to support both project-specific local registries for individual
1172/// bioinformatics projects, as well as user-wide global registries for sharing reference data
1173/// between projects.
1174///
1175/// The struct maintains the resolved absolute path to the registry file, along with project
1176/// metadata and the global/local setting. It provides methods to safely initialize new registries
1177/// and read/write registry data while maintaining data integrity.
1178///
1179/// Generally you won't construct this struct directly, but rather obtain it through the Project
1180/// struct's methods which handle the configuration details automatically. However, advanced users
1181/// can use `RegistryOptions` directly for custom registry handling.
1182///
1183/// This is a core struct in refman's architecture, working closely with Project to provide the
1184/// foundational registry management capabilities that the rest of the tool builds upon.
pub struct RegistryOptions {
    /// Full path of the registry file, as produced by `resolve_registry_path` in
    /// `try_new`; this is the file that `init`, `read_registry` and `write_registry`
    /// operate on.
    resolved_path: PathBuf,
    /// Optional project title, handed to `Project::new` when `init` creates a registry.
    title: Option<String>,
    /// Optional project description, handed to `Project::new` when `init` creates a registry.
    description: Option<String>,
    /// Whether the registry is global (`true`) or local (`false`); used during path
    /// resolution and handed to `Project::new` by `init`.
    global: bool,
}
1191
1192impl RegistryOptions {
1193    /// Creates a new `RegistryOptions` instance with customized settings for registry file handling.
1194    ///
1195    /// This struct provides granular control over how refman interacts with registry files,
1196    /// determining where they are stored and how they are initialized. It implements the core
1197    /// logic for resolving registry paths according to the following priority:
1198    ///
1199    /// 1. User-specified custom path via `requested_path` parameter
1200    /// 2. For global registries (`global = true`):
1201    ///    - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
1202    ///    - ~/.refman/refman.toml as default global location
1203    /// 3. For local registries (`global = false`):
1204    ///    - ./refman.toml in current directory
1205    ///
1206    /// The struct handles all filesystem interactions needed to:
1207    /// - Resolve and validate registry file paths
1208    /// - Create new registry files or directories as needed
1209    /// - Manage environment variables like `REFMAN_HOME`
1210    /// - Initialize registries with project metadata
1211    ///
1212    /// It works closely with the Project struct to provide the foundational registry
1213    /// management capabilities that refman builds upon. While most users will interact
1214    /// with registries through the Project API, this struct allows advanced users to
1215    /// customize registry behavior.
1216    ///
1217    /// The method performs validation to ensure the requested registry location is
1218    /// accessible and can be written to. It handles edge cases like missing directories
1219    /// and environment variables gracefully.
1220    ///
1221    /// # Arguments
1222    ///
1223    /// * `title` - Optional title for the registry/project
1224    /// * `description` - Optional description text
1225    /// * `requested_path` - Optional custom path where the registry should be stored
1226    /// * `global` - Whether this is a global (true) or local (false) registry
1227    ///
1228    /// # Returns
1229    ///
1230    /// Returns Ok(RegistryOptions) if initialization succeeds, or `RegistryError` if:
1231    /// - The requested path is invalid or inaccessible
1232    /// - Required directories cannot be created
1233    /// - Environment variables cannot be set
1234    /// - Other filesystem operations fail
1235    ///
1236    /// # Errors
1237    ///
1238    /// This method can return `RegistryError` variants for various filesystem and
1239    /// environment access failures. The error types provide context about what
1240    /// specifically failed during registry setup.
1241    pub fn try_new(
1242        title: Option<String>,
1243        description: Option<String>,
1244        requested_path: &Option<String>,
1245        global: bool,
1246    ) -> Result<RegistryOptions, RegistryError> {
1247        // If the user requested a path, see if it exists and is accessible, and
1248        // try to make it work
1249        if let Some(possible_path) = requested_path.as_deref() {
1250            let maybe_path = PathBuf::from_str(possible_path).ok();
1251            let resolved_path = resolve_registry_path(maybe_path, global)?;
1252
1253            Ok(Self {
1254                resolved_path,
1255                title,
1256                description,
1257                global,
1258            })
1259        // otherwise, resolve a path with default settings
1260        } else {
1261            let resolved_path = resolve_registry_path(None, global)?;
1262
1263            Ok(Self {
1264                resolved_path,
1265                title,
1266                description,
1267                global,
1268            })
1269        }
1270    }
1271
1272    /// Initializes a new registry file for the Project if one doesn't already exist.
1273    ///
1274    /// This method handles creating and initializing the registry file that stores a
1275    /// Project's reference datasets and metadata. The registry file location is determined
1276    /// by the `RegistryOptions` configuration, following these rules:
1277    ///
1278    /// 1. User-specified custom path if provided to `RegistryOptions::try_new()`
1279    /// 2. For global registries (global = true):
1280    ///    - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
1281    ///    - ~/.refman/refman.toml as default global location
1282    /// 3. For local registries (global = false):
1283    ///    - ./refman.toml in current directory
1284    ///
1285    /// The method will:
1286    /// - Create a new refman.toml file if one doesn't exist at the resolved path
1287    /// - Initialize it with provided title and description if specified
1288    /// - Set appropriate global/local flag
1289    /// - Create any necessary parent directories
1290    /// - Handle filesystem permissions and access
1291    ///
1292    /// If a registry file already exists at the target location, the method will
1293    /// log an informational message and take no action, preserving the existing
1294    /// registry data.
1295    ///
1296    /// This is typically called automatically when creating new Projects, but can
1297    /// be called directly for custom registry initialization workflows. The method
1298    /// integrates with refman's overall registry management system to maintain
1299    /// data integrity and consistent state.
1300    ///
1301    /// # Returns
1302    ///
1303    /// Returns Ok(()) if initialization succeeds or registry already exists.
1304    /// Returns `RegistryError` if filesystem operations fail due to permissions,
1305    /// invalid paths, or other IO errors.
1306    ///
1307    /// # Errors
1308    ///
1309    /// Can return `RegistryError` variants for:
1310    /// - Failed file creation
1311    /// - Invalid paths
1312    /// - Insufficient permissions
1313    /// - Filesystem errors
1314    pub fn init(&self) -> Result<(), RegistryError> {
1315        // If a refman.toml doesn't exist, make it and write out the available information
1316        if self.resolved_path.exists() {
1317            info!("A refman registry already exists. Start filling it with `refman register`.");
1318        } else {
1319            let mut new_project =
1320                Project::new(self.title.clone(), self.description.clone(), self.global);
1321            File::create(&self.resolved_path)?;
1322
1323            self.write_registry(&mut new_project)?;
1324            // Otherwise, do nothing except log out that a registry file already exists
1325        }
1326        Ok(())
1327    }
1328
1329    /// Reads and deserializes a registry file into a Project, or initializes a new empty Project.
1330    ///
1331    /// This method handles loading registry data from refman.toml files. It follows these rules:
1332    /// - If no registry file exists at the resolved path, returns a default empty Project
1333    /// - If an empty registry file exists, returns a default empty Project
1334    /// - Otherwise deserializes the TOML file into a Project instance
1335    ///
1336    /// The registry file path is determined by `RegistryOptions` rules, in order:
1337    /// 1. User-specified custom path if provided
1338    /// 2. For global registries (global = true):
1339    ///    - `$REFMAN_HOME/.refman/refman.toml`
1340    ///    - ~/.refman/refman.toml (default)
1341    /// 3. For local registries (global = false):
1342    ///    - ./refman.toml
1343    ///
1344    /// This method is core to refman's persistence layer, allowing Projects to be saved and
1345    /// loaded across sessions. It works in tandem with `write_registry()` to maintain registry
1346    /// state. The registry files store:
1347    /// - Project metadata (title, description)
1348    /// - Dataset entries with labels and file URLs
1349    /// - Last modified timestamp
1350    /// - Global/local status
1351    ///
1352    /// The method handles common edge cases like:
1353    /// - Missing registry files
1354    /// - Empty registry files
1355    /// - Invalid TOML formatting
1356    /// - File access errors
1357    ///
1358    /// This is typically called internally by Project methods that need to load registry
1359    /// state, but can be used directly for custom registry reading workflows.
1360    ///
1361    /// # Returns
1362    ///
1363    /// Returns Ok(Project) containing either:
1364    /// - A deserialized Project from the registry file
1365    /// - A new empty Project if no valid registry exists
1366    ///
1367    /// # Errors
1368    ///
1369    /// Returns `RegistryError` if:
1370    /// - File operations fail (permissions, IO errors)
1371    /// - TOML deserialization fails
1372    /// - Registry path resolution fails
1373    pub fn read_registry(&self) -> Result<Project, RegistryError> {
1374        // To save some effort, first check if the refman.toml exists. If it doesn't,
1375        // just set up a project with default settings and early-return that
1376        if !self.resolved_path.exists() {
1377            let new_project = Project::default();
1378            return Ok(new_project);
1379        }
1380
1381        // Additionally, if a file exists but is empty, pretend it doesn't exist and do
1382        // the same thing as above
1383        if fs::metadata(&self.resolved_path)?.len() == 0 {
1384            let new_project = Project::default();
1385            return Ok(new_project);
1386        }
1387
1388        // If neither of those conditions were met, read and deserialize the TOML
1389        // file into a Project struct and return it
1390        let toml_contents = read_to_string(self.resolved_path.clone())?;
1391        let project: Project = toml::from_str(&toml_contents)?;
1392        Ok(project)
1393    }
1394    /// Writes a Project's registry data to the refman.toml file at the resolved registry path.
1395    ///
1396    /// This method handles persisting Project state to disk, including:
1397    /// - All registered datasets with their labels and file URLs
1398    /// - Project metadata like title and description
1399    /// - Last modified timestamp
1400    /// - Global/local registry status
1401    ///
1402    /// The registry file location follows `RegistryOptions` rules, in order:
1403    /// 1. User-specified custom path if provided
1404    /// 2. For global registries (global = true):
1405    ///    - `$REFMAN_HOME/.refman/refman.toml`
1406    ///    - ~/.refman/refman.toml (default)
1407    /// 3. For local registries (global = false):
1408    ///    - ./refman.toml
1409    ///
1410    /// This method works in tandem with `read_registry()` to maintain persistent state
1411    /// across refman sessions. When writing, it:
1412    /// - Updates the last modified timestamp
1413    /// - Serializes the Project data to TOML format
1414    /// - Writes the TOML to the resolved registry path
1415    /// - Creates/overwrites the registry file as needed
1416    ///
1417    /// This is typically called internally by Project methods that modify state, but
1418    /// can be used directly for custom registry writing workflows. The method integrates
1419    /// with refman's overall registry management system to maintain data integrity.
1420    ///
1421    /// # Arguments
1422    ///
1423    /// * `project` - Mutable reference to the Project whose state should be written
1424    ///
1425    /// # Returns
1426    ///
1427    /// Returns Ok(()) if the write succeeds, or `RegistryError` if filesystem operations fail.
1428    ///
1429    /// # Errors
1430    ///
1431    /// Returns `RegistryError` if:
1432    /// - File operations fail (permissions, IO errors)
1433    /// - TOML serialization fails
1434    /// - Registry path resolution fails
1435    ///
1436    /// # Panics
1437    ///
1438    /// This method does not panic under normal circumstances, but may panic if the filesystem
1439    /// becomes inaccessible while writing or if memory allocation fails during serialization.
1440    pub fn write_registry(&self, project: &mut Project) -> Result<(), RegistryError> {
1441        // update the timestamp
1442        project.project.last_modified = Timestamp::now();
1443
1444        // serialize and write out the TOML file
1445        let toml_text = toml::to_string_pretty(project)?;
1446        fs::write(&self.resolved_path, toml_text)?;
1447
1448        Ok(())
1449    }
1450}
1451
1452fn resolve_registry_path(
1453    maybe_path: Option<PathBuf>,
1454    global: bool,
1455) -> Result<PathBuf, RegistryError> {
1456    // to resolve a registry path, a fair amount of control flow needs to happen to unwrap a few conditions.
1457    // First, we prioritize a directory the user requests we place the registry in, if provided. This is the simplest
1458    // branch and comes first.
1459    let registry_path = match maybe_path {
1460        Some(valid_path) => {
1461            if let Some(path_str) = valid_path.to_str() {
1462                debug!("Setting the refman home to '{path_str}'");
1463                set_refman_home(path_str);
1464            }
1465            valid_path.join("refman.toml")
1466        },
1467
1468        // If the user did not request a particular directory, we then check if a global registry was requested.
1469        // If not, this is the next simplest case; just place the registry in the current working directory (ideally,
1470        // the project root).
1471        None => {
1472            // If not global, use the current directory as the refman home and return the full path.
1473            if !global {
1474                let current_dir = current_dir()?;
1475                if let Some(current_dir_string) = current_dir.to_str() {
1476                    debug!("Setting the refman home to '{current_dir_string}'");
1477                    set_refman_home(current_dir_string);
1478                }
1479
1480                return Ok(current_dir.join("refman.toml"));
1481            }
1482
1483            // If no desired directory was provided, but the user also requested that the registry is global, first
1484            // check the environment variable REFMAN_HOME for the registry's location.
1485            let refman_home: Option<PathBuf> = match env::var("REFMAN_HOME") {
1486                Ok(path_str) => {
1487                    debug!(
1488                        "Desired file path detected in the REFMAN_HOME environment variable: '{}'. A global registry will be placed there.",
1489                        path_str
1490                    );
1491                    let path = PathBuf::from(path_str);
1492                    Some(path)
1493                },
1494                // If that environment variable isn't set, place it in the home directory.
1495                Err(_) => {
1496                    debug!(
1497                        "The REFMAN_HOME variable is not set. The registry will thus be placed in its default location in the user's home directory."
1498                    );
1499                    dirs::home_dir()
1500                },
1501            };
1502
1503            // Finally, whether the home directory is being used or the current directory as a fallback, join on
1504            // a subdirectory called ".refman" and then "refman.toml" onto that.
1505            if let Some(dir) = refman_home {
1506                let resolved_home = dir.join(".refman");
1507                debug!("setting the refman home to '{:?}'", resolved_home);
1508                resolved_home
1509            } else {
1510                warn!("unable to access home directory, so `refman `will place its registry in the current working directory. unless this path is provided in the next `refman` run, `refman` may be unable to pick up where it leaves off during the current run.");
1511                let current_dir = current_dir()?;
1512                if let Some(current_dir_string) = current_dir.to_str() {
1513                    debug!("setting the refman home to '{current_dir_string}'");
1514                    set_refman_home(current_dir_string);
1515                }
1516                let resolved_home = current_dir.join(".refman");
1517                debug!("setting the refman home to '{:?}'", resolved_home);
1518                resolved_home
1519            }.join("refman.toml")
1520        }, // TODO: Eventually, it would be cool to have a global dotfile config for refman so the user doesn't have
1521           // to tell it to operate globally every time.
1522    };
1523
1524    Ok(registry_path)
1525}
1526
1527fn set_refman_home(desired_dir: &str) {
1528    // If REFMAN_HOME is set,
1529    if let Ok(old_home) = env::var("REFMAN_HOME") {
1530        warn!(
1531            "The environment variable $REFMAN_HOME was previously set to {}, but a new location at {} was requested. `refman` will overwrite the old $REFMAN_HOME value and proceed.",
1532            old_home, desired_dir
1533        );
1534        unsafe { env::set_var("REFMAN_HOME", desired_dir) }
1535    } else {
1536        debug!(
1537            "The REFMAN_HOME environment variable has not previously been set. Now setting it to the requested directory, {}",
1538            desired_dir
1539        );
1540        unsafe { env::set_var("REFMAN_HOME", desired_dir) }
1541    }
1542}
1543
/// Heuristically decides whether `url` is a remote URL rather than, say, a local file path.
///
/// The string counts as a likely URL when it starts with one of the supported schemes (`http`,
/// `https`, `ftp`, `sftp`) immediately followed by `://`. Schemes are compared
/// case-insensitively, as URL schemes are not case-sensitive. Requiring the `://` separator keeps
/// local paths that merely begin with those letters (e.g. `ftp_mirror/genome.fasta` or
/// `httpd.conf`) from being mistaken for URLs.
fn is_likely_url(url: &str) -> bool {
    const SUPPORTED_SCHEMES: [&str; 4] = ["http", "https", "ftp", "sftp"];

    match url.split_once("://") {
        Some((scheme, _)) => SUPPORTED_SCHEMES
            .iter()
            .any(|supported| scheme.eq_ignore_ascii_case(supported)),
        None => false,
    }
}
1547
1548#[inline]
1549fn count_downloads(dataset_files: &[(RefDataset, Vec<UnvalidatedFile>)]) -> usize {
1550    // count the files to generate a message to inform the user of what will be downloaded
1551    let mut num_to_download = 0;
1552    for (_, files) in dataset_files {
1553        num_to_download += files.len();
1554    }
1555    info!("{num_to_download} downloads are confirmed. Proceeding...");
1556    num_to_download
1557}
1558
1559#[allow(clippy::expect_used)]
1560fn setup_progress_tracking(
1561    label: Option<&str>,
1562    num_to_download: usize,
1563) -> (ProgressBar, Arc<MultiProgress>) {
1564    // generate a message based on whether a particular dataset was requested as well as on the number
1565    // of files to be downloaded.
1566    let message = match label {
1567        Some(label_str) => {
1568            format!("Downloading {num_to_download} files for project labeled '{label_str}'...")
1569        },
1570        None => format!("Downloading all {num_to_download} files listed in the refman registry..."),
1571    };
1572
1573    // Create a shared MultiProgress container.
1574    let multi_pb = Arc::new(MultiProgress::new());
1575
1576    // Create a top-level progress bar with total length equal to the number of files, and set its starting message
1577    // with the message computed above
1578    let toplevel_pb = multi_pb.add(ProgressBar::new(num_to_download as u64));
1579    toplevel_pb.set_style(
1580        ProgressStyle::default_bar()
1581            .template("{msg} [{bar:40.cyan/blue}] {pos}/{len} ({eta})")
1582            .expect("Failed to set template"),
1583    );
1584    toplevel_pb.set_message(message);
1585
1586    // return a raw tuple containing the number to download, the top-level progress bar, and the per-file
1587    // progress bar
1588    (toplevel_pb, multi_pb)
1589}
1590
1591fn submit_download_requests(
1592    dataset_files: Vec<(RefDataset, Vec<UnvalidatedFile>)>,
1593    shared_client: &Client,
1594    target_dir: &Path,
1595    mp: &Arc<MultiProgress>,
1596) -> Vec<JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>> {
1597    // count the number of files to download
1598    let num_to_download = dataset_files.len();
1599
1600    // use that count to initialize a vector of exactly the right length, each item of which will be a deeply
1601    // nested result of vectors of results. This is because tasks will be spawned at two levels: one task per
1602    // request `RefDataset`, and all the registered files per `RefDataset`.
1603    let mut dataset_task_handles: Vec<
1604        JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>,
1605    > = Vec::with_capacity(num_to_download);
1606
1607    // Go through each dataset and its registered files and request them. This design can be thought of somewhat like
1608    // actors, where each dataset task supervises each file download task
1609    for (dataset, files) in dataset_files {
1610        let shared_client = shared_client.clone();
1611        let mp = mp.clone();
1612        let target_dir = Arc::new(target_dir.to_path_buf());
1613
1614        // Spawn a task per dataset
1615        let handle: JoinHandle<_> = tokio::spawn(async move {
1616            // Inside this task: spawn parallel tasks for each file
1617            let file_task_handles = files.into_iter().map(|file| {
1618                let client = shared_client.clone();
1619                let dir = target_dir.clone();
1620                let mp = mp.clone();
1621
1622                tokio::spawn(async move { request_dataset(file, client, dir, mp).await })
1623            });
1624
1625            // Await all file download tasks for this dataset
1626            let file_results = try_join_all(file_task_handles).await?;
1627
1628            Ok((dataset, file_results))
1629        });
1630
1631        // collect all the dataset "supervisor" tasks
1632        dataset_task_handles.push(handle);
1633    }
1634
1635    dataset_task_handles
1636}
1637
1638async fn update_project_datasets(
1639    dataset_task_handles: Vec<JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>>,
1640    toplevel_pb: &mut ProgressBar,
1641) -> color_eyre::Result<Vec<RefDataset>> {
1642    // await all tasks in all threads, collecting them into a vec after the transformations below
1643    let updated_datasets: Vec<RefDataset> = try_join_all(dataset_task_handles)
1644        .await?
1645        // no need to reference each item; consuming them is fine here
1646        .into_iter()
1647        // take each attempted download, and, if successful, increment the progress bar, and then keep the
1648        // successful unvalidated downloads in a vector
1649        .filter_map(|dataset_result| {
1650            toplevel_pb.inc(1);
1651            match dataset_result {
1652                Ok((dataset, file_results)) => {
1653                    match file_results.into_iter().collect::<Result<Vec<_>, _>>() {
1654                        Ok(successful_files) => Some((dataset, successful_files)),
1655                        Err(msg) => {
1656                            warn!("Failed to download files because of this error: {}", msg);
1657                            None
1658                        }
1659                    }
1660                }
1661                Err(msg) => {
1662                    warn!("Failed to download files because of this error: {}", msg);
1663                    None
1664                }
1665            }
1666        })
1667        // now use each successful download to update its associated dataset, returning an owned updated dataset or
1668        // a validation error (the update performs validation under the hood)
1669        .map(
1670            |(mut dataset, files)| -> Result<RefDataset, ValidationError> {
1671                for file in files {
1672                    dataset.update_with_download(&file)?;
1673                }
1674                Ok(dataset)
1675            },
1676        )
1677        .collect::<Result<Vec<RefDataset>, ValidationError>>()?;
1678
1679    // return the vector of updated `RefDataset` instances
1680    Ok(updated_datasets)
1681}
1682
#[cfg(test)]
mod tests {
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use tempfile::tempdir;

    /// A freshly created project keeps the given metadata and starts with no datasets.
    #[test]
    fn test_new_project() {
        let project = Project::new(
            Some("Test Project".to_string()),
            Some("A test project".to_string()),
            false,
        );
        let metadata = &project.project;

        assert_eq!(metadata.title.as_deref(), Some("Test Project"));
        assert_eq!(metadata.description.as_deref(), Some("A test project"));
        assert!(!metadata.global);
        assert!(metadata.datasets.is_empty());
    }

    /// Registry options built for an explicit directory resolve to `refman.toml` inside it and
    /// retain the title, description and global flag they were given.
    #[test]
    fn test_registry_options_new() {
        let scratch = tempdir().unwrap();
        let requested_dir = scratch.path().to_str().unwrap().to_string();
        let expected_path = scratch.path().join("refman.toml");

        let options = RegistryOptions::try_new(
            Some("Test Registry".to_string()),
            Some("Test Description".to_string()),
            &Some(requested_dir),
            false,
        )
        .unwrap();

        assert_eq!(options.resolved_path, expected_path);
        assert_eq!(options.title.as_deref(), Some("Test Registry"));
        assert_eq!(options.description.as_deref(), Some("Test Description"));
        assert!(!options.global);
    }

    /// Writing a registry creates the file on disk, and reading it back yields an empty project.
    #[test]
    fn test_read_write_registry() {
        let scratch = tempdir().unwrap();
        let requested_dir = Some(scratch.path().to_str().unwrap().to_string());
        let options = RegistryOptions::try_new(None, None, &requested_dir, false).unwrap();

        // Writing must create the registry file.
        let mut fresh_project = Project::new(None, None, false);
        options.write_registry(&mut fresh_project).unwrap();
        assert!(options.resolved_path.exists());

        // Reading it back must give a project without any datasets.
        let reloaded = options.read_registry().unwrap();
        assert_eq!(reloaded.datasets().len(), 0);
    }
}
refman/project.rs

refman/
project.rs