refman/project.rs
1use std::{
2 collections::HashMap,
3 env::{self, current_dir},
4 fs::{self, File, read_to_string},
5 path::{Path, PathBuf},
6 str::FromStr,
7 sync::Arc,
8};
9
10use color_eyre::eyre::{Error as ColorError, eyre};
11use futures::future::try_join_all;
12use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
13use jiff::Timestamp;
14use log::{debug, info, warn};
15use prettytable::{Table, row};
16use reqwest::Client;
17use serde::{Deserialize, Serialize};
18use tokio::task::JoinHandle;
19
20use crate::{
21 EntryError, RegistryError, ValidationError,
22 data::{DownloadStatus, RefDataset},
23 downloads::{check_url, request_dataset},
24 validate::UnvalidatedFile,
25};
26
27/// A reference manager for all data associated with your bioinformatics project.
28///
29/// Projects are the top-level abstraction in refman, allowing you to register, track,
30/// download, and manage reference files like FASTA, Genbank, GFA, GFF, GTF and BED files
31/// for your bioinformatics work. A Project maintains a registry of datasets, where each dataset
32/// has a unique label and can contain references to multiple file types.
33///
34/// The Project struct provides methods to:
35/// - Initialize new reference management projects
36/// - Register new datasets or update existing ones
37/// - Download registered datasets from remote URLs
38/// - Remove datasets from the registry
39/// - Pretty print the current state of registered datasets
40///
41/// Projects can be either local (stored in ./refman.toml) or global (stored in ~/.refman/refman.toml).
42/// The registry location can also be customized via the `REFMAN_HOME` environment variable.
43///
44/// Each dataset in a project is tracked with a unique label and can contain optional URLs pointing
45/// to reference files in standard bioinformatics formats (FASTA, Genbank, GFA, GFF, GTF, BED).
46/// The registry maintains metadata like when it was last modified and optional title/description fields.
47///
48/// # Examples
49///
50/// ```no_run
51/// # use refman::project::Project;
52/// // Create a new local project
53/// let project = Project::new(
54/// Some("My Assembly Project".to_string()),
55/// Some("Reference data for genome assembly".to_string()),
56/// false
57/// );
58/// ```
59///
60/// The Project struct integrates with other refman types like `RefDataset` for managing individual
61/// reference datasets and `RegistryOptions` for configuring where and how the registry is stored.
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct Project {
    /// The registry backing this project: its title, description, global flag,
    /// last-modified timestamp, and the list of registered datasets. Because the
    /// struct derives `Serialize`/`Deserialize`, this field is (de)serialized
    /// under the `project` key.
    project: Registry,
}
66
/// A list of per-item results, each holding either an `UnvalidatedFile` or a
/// `color_eyre` error.
///
/// NOTE(review): presumably the collected outcomes of a batch of concurrent
/// downloads — confirm against the code that uses this alias.
type MultiDownloadResults = Vec<Result<UnvalidatedFile, ColorError>>;
68
69impl Project {
70 /// Creates a new Project struct with optional title and description strings and
71 /// a boolean flag controlling if the project's registry file is placed in a
72 /// global location (`REFMAN_HOME`or ~/.refman) or locally (./refman.toml).
73 ///
74 /// A Project is the top-level struct for managing reference data in refman. It
75 /// maintains a registry of reference genomics datasets, where each dataset can
76 /// include references to multiple standard bioinformatics file formats like
77 /// FASTA, Genbank, GFA, GFF, GTF and BED files. The registry stores metadata
78 /// about each reference dataset including when it was last modified.
79 ///
80 /// The registry file location depends on the `global` parameter:
81 /// - If `global=false` (default), creates a local refman.toml in current directory
82 /// - If `global=true`, uses either `$REFMAN_HOME/.refman/refman.toml` or ~/.refman/refman.toml
83 ///
84 /// # Arguments
85 ///
86 /// * `title` - Optional title for the project
87 /// * `description` - Optional description of the project
88 /// * `global` - Whether to store the registry file globally or locally
89 ///
90 /// # Returns
91 ///
92 /// Returns a new Project instance initialized with the provided title, description
93 /// and global flag. The internal Registry is created with default values for
94 /// `last_modified` timestamp and an empty datasets vector.
95 fn new(title: Option<String>, description: Option<String>, global: bool) -> Self {
96 // fill in any user provided title, description, or global information on
97 // top of the information stored in a project by default
98 let registry = Registry {
99 title,
100 description,
101 global,
102 ..Registry::default()
103 };
104
105 Self { project: registry }
106 }
107
108 /// Returns a read-only slice of all reference datasets currently registered in the project.
109 ///
110 /// This method provides access to the raw collection of `RefDataset` entries stored in the
111 /// project's registry. Each `RefDataset` represents a labeled collection of bioinformatics
112 /// reference files, potentially including FASTA, Genbank, GFA, GFF, GTF and BED formats.
113 ///
114 /// This accessor is useful for:
115 /// - Inspecting the currently registered datasets without modifying them
116 /// - Iterating over registered datasets to check their properties
117 /// - Filtering datasets based on custom criteria
118 /// - Accessing individual dataset labels and file URLs
119 ///
120 /// The returned slice allows read-only access to ensure the registry's integrity is maintained.
121 /// For mutable access, use `datasets_mut()` instead. For taking ownership of the datasets,
122 /// use `datasets_owned()`.
123 ///
124 /// # Returns
125 ///
126 /// A read-only slice containing all registered `RefDataset` entries in the project.
127 /// Returns an empty slice if no datasets are registered.
128 #[inline]
129 #[must_use]
130 pub fn datasets(&self) -> &[RefDataset] {
131 self.project.datasets.as_slice()
132 }
133
134 /// Returns a mutable slice of all reference datasets registered in the project.
135 ///
136 /// This method provides mutable access to the raw collection of `RefDataset` entries stored in
137 /// the project's registry. Each `RefDataset` represents a labeled collection of bioinformatics
138 /// reference files, potentially including FASTA, Genbank, GFA, GFF, GTF and BED formats.
139 ///
140 /// Mutable access allows modifying existing datasets, for example to:
141 /// - Update file URLs for existing datasets
142 /// - Modify dataset labels or other metadata
143 /// - Add or remove file references from datasets
144 /// - Reorder datasets within the registry
145 ///
146 /// Use this method with caution as it allows direct mutation of the registry state.
147 /// For read-only access, use `datasets()` instead. To take ownership of the datasets,
148 /// use `datasets_owned()`.
149 ///
150 /// # Returns
151 ///
152 /// A mutable slice containing all registered `RefDataset` entries in the project.
153 /// Returns an empty slice if no datasets are registered.
154 #[inline]
155 pub fn datasets_mut(&mut self) -> &mut [RefDataset] {
156 self.project.datasets.as_mut_slice()
157 }
158
159 /// Takes ownership of all reference datasets registered in the project.
160 ///
161 /// This method provides a way to take ownership of the raw collection of `RefDataset` entries
162 /// stored in the project's registry, consuming the project in the process. Each `RefDataset`
163 /// represents a labeled collection of bioinformatics reference files, potentially including
164 /// FASTA, Genbank, GFA, GFF, GTF and BED formats.
165 ///
166 /// Taking ownership via `datasets_owned()` allows:
167 /// - Moving datasets out of the Project context entirely
168 /// - Transferring datasets between Projects
169 /// - Performing owned operations on datasets that require ownership
170 /// - Converting datasets into other data structures
171 ///
172 /// This is different from `datasets()` which provides read-only access an`datasets_mut()`
173 /// which provides mutable access but keeps ownership within the Project. Using `datasets_owned()`
174 /// consumes the Project instance.
175 ///
176 /// # Returns
177 ///
178 /// A Vec containing all registered `RefDataset` entries, transferring ownership from
179 /// the Project to the caller. Returns an empty Vec if no datasets were registered.
180 /// The Project instance is consumed in the process.
181 #[inline]
182 #[must_use]
183 pub fn datasets_owned(self) -> Vec<RefDataset> {
184 self.project.datasets
185 }
186
187 /// Returns a reference to a specific dataset from the Project's registry by its label.
188 ///
189 /// This method provides direct access to individual reference datasets stored in the project's
190 /// registry. It takes a label string and returns a reference to the matching `RefDataset` if one
191 /// exists. Each dataset in a refman Project has a unique label that identifies it, containing
192 /// optional references to various bioinformatics file formats (FASTA, Genbank, GFA, GFF, GTF, BED).
193 ///
194 /// The method enforces that:
195 /// - The label must exactly match a registered dataset (case-sensitive)
196 /// - Only one dataset can have a given label (unique key constraint)
197 /// - The dataset must exist in the registry
198 ///
199 /// This is commonly used to:
200 /// - Check details of specific registered datasets
201 /// - Access dataset file URLs before downloading
202 /// - Verify dataset registration status
203 /// - Extract dataset metadata
204 ///
205 /// The method complements other Project methods like `register()` an`download_dataset()`() in the
206 /// dataset management lifecycle. While those methods add and fetch datasets, `get_dataset()`
207 /// provides read access to verify and inspect registered data.
208 ///
209 /// # Arguments
210 ///
211 /// * `label` - The unique label identifying the dataset to retrieve
212 ///
213 /// # Returns
214 ///
215 /// Returns Ok(&RefDataset) with a reference to the matching dataset if found.
216 /// Returns `EntryError::LabelNotFound` if no dataset matches the provided label.
217 ///
218 /// # Errors
219 ///
220 /// Can return `EntryError::LabelNotFound` if the requested dataset label is not
221 /// registered in the project.
222 ///
223 /// # Panics
224 ///
225 /// This method will panic if:
226 /// - More than one dataset with the same label exists in the registry
227 /// (indicates invalid state as labels must be unique)
228 /// - The filtered dataset collection contains an unexpected number of matches
229 /// (should be exactly 1 match for a valid label)
230 #[inline]
231 pub fn get_dataset(&self, label: &str) -> Result<&RefDataset, EntryError> {
232 // pull in a read-only slice of the datasets currently in project state
233 let datasets = self.datasets();
234
235 // If a dataset isn't in the current project state, return a refman error
236 // wrapped in an anyhow error.
237 if datasets
238 .iter()
239 .map(|dataset| dataset.label.as_str())
240 .filter(|ds_label| *ds_label == label)
241 .collect::<Vec<&str>>()
242 .is_empty()
243 {
244 Err(EntryError::LabelNotFound(label.to_string()))?;
245 }
246
247 // make sure only one dataset matches the provided label, which must be a unique
248 // key
249 let entry: Vec<_> = datasets
250 .iter()
251 .filter(|dataset| dataset.label == label)
252 .collect();
253 assert_eq!(entry.len(), 1);
254
255 Ok(entry[0])
256 }
257
258 /// Returns a vector of all registered file URLs for a dataset with the given label.
259 ///
260 /// This method provides access to all file URLs registered for a dataset, combining any valid URLs
261 /// across the supported bioinformatics file formats (FASTA, Genbank, GFA, GFF, GTF, BED). The URLs
262 /// can then be used to download reference files, validate dataset completeness, or inspect available
263 /// file formats.
264 ///
265 /// The method will:
266 /// - Verify the dataset exists by the given label
267 /// - Extract all non-None URLs registered for that dataset
268 /// - Return them as a vector in a consistent order (FASTA, Genbank, etc.)
269 ///
270 /// This complements other dataset access methods by providing URL-specific functionality. While
271 /// `get_dataset()` returns the full dataset struct an`download_dataset()` handles file fetching,
272 /// `get_dataset_urls()` focuses specifically on URL access and validation.
273 ///
274 /// The method is used internally by `download_dataset()` to determine which files to fetch, but can
275 /// also be used directly to:
276 /// - Preview what files are available before downloading
277 /// - Extract URLs for custom download handling
278 /// - Verify dataset completeness
279 /// - Share dataset URLs
280 ///
281 /// # Arguments
282 ///
283 /// * `label` - The unique label identifying the dataset whose URLs should be retrieved
284 ///
285 /// # Returns
286 ///
287 /// Returns Ok(Vec<String>) containing all non-None URLs registered for the dataset.
288 /// Returns an empty vector if the dataset exists but has no URLs registered.
289 /// Returns `EntryError::LabelNotFound` if no dataset matches the provided label.
290 ///
291 /// # Errors
292 ///
293 /// Can return `EntryError::LabelNotFound` if the requested dataset label is not in the registry.
294 #[inline]
295 pub fn get_dataset_urls(&self, label: &str) -> Result<Vec<String>, EntryError> {
296 // access the dataset for the provided label
297 let dataset = self.get_dataset(label)?;
298
299 // build a vector based on the URLs that may or may not be available for downloading
300 let urls = vec![
301 dataset.fasta.clone(),
302 dataset.genbank.clone(),
303 dataset.gfa.clone(),
304 dataset.gff.clone(),
305 dataset.gtf.clone(),
306 dataset.bed.clone(),
307 ]
308 .into_iter()
309 .flatten()
310 .map(|download| download.url_owned())
311 .collect::<Vec<String>>();
312
313 Ok(urls)
314 }
315
316 /// Returns a vector of URLs for all reference data across all registered datasets.
317 ///
318 /// This method provides access to all file URLs registered in the project's datasets,
319 /// aggregating URLs from each dataset and each supported bioinformatics file format
320 /// (FASTA, Genbank, GFA, GFF, GTF, BED). It is useful for:
321 /// - Getting an overview of all reference data in the project
322 /// - Batch downloading all registered files
323 /// - Validating URLs across the entire registry
324 /// - Sharing/exporting full URL lists
325 ///
326 /// The method processes each dataset sequentially, collecting any non-None URLs into
327 /// a single vector. URLs are gathered in a consistent order per dataset:
328 /// FASTA -> Genbank -> GFA -> GFF -> GTF -> BED.
329 ///
330 /// Unlike `get_dataset_urls()` which operates on a single labeled dataset, this method
331 /// provides complete URL access across the entire registry. It complements other Project
332 /// methods like `download_dataset()` by enabling bulk operations across all reference data.
333 ///
334 /// The method enforces URL validity by checking that:
335 /// - No empty URLs are included
336 /// - All URLs use either http:// or https:// protocols
337 ///
338 /// # Returns
339 ///
340 /// Returns Ok(Vec<String>) containing all valid URLs across all datasets.
341 /// Returns an empty vector if no URLs are registered.
342 ///
343 /// # Errors
344 ///
345 /// Can return `EntryError` variants if:
346 /// - Dataset access fails
347 /// - URL validation fails
348 /// - Project state is invalid
349 ///
350 /// # Panics
351 ///
352 /// This method will panic if:
353 /// - Empty URLs are found in datasets (invalid state)
354 /// - URLs with invalid protocols are found (must be http/https)
355 #[inline]
356 pub fn get_all_urls(&self) -> Result<Vec<String>, EntryError> {
357 // access the dataset for the provided label
358 let datasets = self.datasets();
359
360 // build a vector based on the URLs that may or may not be available for downloading
361 let mut all_urls = Vec::new();
362 for dataset in datasets {
363 let urls = vec![
364 dataset.fasta.clone(),
365 dataset.genbank.clone(),
366 dataset.gfa.clone(),
367 dataset.gff.clone(),
368 dataset.gtf.clone(),
369 dataset.bed.clone(),
370 ]
371 .into_iter()
372 .flatten()
373 .map(|download| download.url_owned())
374 .collect::<Vec<String>>();
375 all_urls.extend(urls);
376 }
377 assert!(
378 all_urls.iter().all(|url| !url.is_empty()),
379 "Found empty URLs in dataset"
380 );
381 assert!(
382 all_urls
383 .iter()
384 .all(|url| url.starts_with("http://") || url.starts_with("https://")),
385 "Found invalid URL protocols"
386 );
387
388 Ok(all_urls)
389 }
390
391 /// Checks if a dataset with a given label is registered in the project.
392 ///
393 /// This method searches through the project's registry to determine if a dataset
394 /// with the specified label exists. Each dataset in a refman Project must have a
395 /// unique label that identifies it - this label acts as the primary key for the
396 /// dataset within the registry.
397 ///
398 /// This method is useful for:
399 /// - Validating labels before attempting to register or update datasets
400 /// - Checking existence of specific datasets before trying to download them
401 /// - General queries about what data is available in the project
402 ///
403 /// The check is case-sensitive - "genome" and "Genome" are considered different labels.
404 /// Labels must be unique within a project's registry.
405 ///
406 /// # Arguments
407 ///
408 /// * `label` - The label string to search for in the registry
409 ///
410 /// # Returns
411 ///
412 /// Returns `true` if a dataset with the given label exists in the registry,
413 /// `false` otherwise. Note that this only checks for label existence, not whether
414 /// the dataset has any file URLs registered or if those files are accessible.
415 #[must_use]
416 pub fn is_registered(&self, label: &str) -> bool {
417 // Iterate through a slice of the available datasets, keeping only the dataset
418 // with a label matching what the user has requested. Return true if the result
419 // is not empty and false if it is.
420 !self
421 .datasets()
422 .iter()
423 .filter(|dataset| dataset.label == label)
424 .collect::<Vec<&RefDataset>>()
425 .is_empty()
426 }
427
428 /// Registers a new dataset or updates an existing dataset in the Project's registry.
429 ///
430 /// This is one of the core methods for managing reference data in refman. It takes a `RefDataset`
431 /// struct containing a unique label and optional URLs for various bioinformatics file formats
432 /// (FASTA, Genbank, GFA, GFF, GTF, BED, TAR) and either:
433 ///
434 /// - Adds it as a new dataset if the label doesn't exist in the registry yet
435 /// - Updates an existing dataset with any new URLs provided if the label matches
436 ///
437 /// When updating an existing dataset, only fields that are Some(url) in the new `RefDataset`
438 /// will overwrite the existing dataset's fields. This allows for incremental updates where
439 /// you can add new file references to a dataset over time without having to re-specify
440 /// existing URLs.
441 ///
442 /// The registry enforces that dataset labels must be unique - you cannot have two datasets
443 /// with the same label. This allows the label to act as a primary key for looking up and
444 /// managing datasets within the project.
445 ///
446 /// # Arguments
447 ///
448 /// * `new_dataset` - A `RefDataset` struct containing the label and optional file URLs to
449 /// register or update. The label field is required and must be unique within the registry.
450 ///
451 /// # Returns
452 ///
453 /// Returns Ok(Project) with the updated Project if registration succeeds, or an `EntryError`
454 /// if there are issues with the dataset registration (e.g. invalid state detected).
455 ///
456 /// # Examples
457 ///
458 /// To register a new dataset:
459 /// ```rust,no_run
460 /// # use refman::{project::Project, data::RefDataset};
461 /// let mut project = Project::new(None, None, false);
462 /// let dataset = RefDataset {
463 /// label: "genome".into(),
464 /// fasta: Some("https://example.com/genome.fasta".into()),
465 /// ..Default::default()
466 /// };
467 /// project = project.register(dataset).unwrap();
468 /// ```
469 ///
470 /// The registration process will either add this as a new dataset if "genome" is not yet
471 /// registered, or update the existing "genome" dataset with the new FASTA URL if it exists.
472 ///
473 /// # Errors
474 ///
475 /// This method can return several types of errors:
476 /// - `EntryError::LabelNotFound` if the dataset being registered cannot be found during updates
477 /// - `EntryError::FinalEntry` if registering this dataset would leave the registry empty
478 /// - Filesystem errors from reading/writing the registry file
479 /// - Serialization errors when encoding/decoding the registry TOML
480 /// - Permission errors when accessing registry files
481 /// - IO errors if registry files or directories cannot be accessed
482 /// - Environment variable errors if `REFMAN_HOME` is invalid
483 /// - Path resolution errors for invalid registry paths
484 ///
485 /// # Panics
486 ///
487 /// This method will panic if multiple datasets matching the given label are found in
488 /// the registry. This should never happen as labels must be unique, but represents an
489 /// invalid state that requires immediate attention.
490 ///
491 pub async fn register(mut self, new_dataset: RefDataset) -> Result<Self, EntryError> {
492 let Some(dataset_match_idx) = self.get_dataset_idx(&new_dataset.label) else {
493 // if the label wasn't found, it's not in the registry, so it can be safely
494 // appended without any fear of duplication
495 self.project.datasets.push(new_dataset);
496 return Ok(self);
497 };
498
499 // pull in a mutable reference to the slice of datasets, get a mutable reference to the one
500 // dataset we need to update (using the index), and then update each of it's fields if the
501 // user provided values for them.
502 let previous_datasets = self.datasets_mut();
503 let dataset_to_update = &mut previous_datasets[dataset_match_idx];
504
505 // use pattern matching here to get exhaustiveness checking instead of if-else
506 match new_dataset {
507 // if it's a FASTA, make sure the link points to a resource that exists and then update
508 // the registry with it
509 RefDataset {
510 fasta: Some(ref fasta),
511 ..
512 } => {
513 // TODO: All checkes here, including whether a URI is likely a URL, whether that URL is valid, and
514 // whether a local path exists, could be included in a guard, relegating the single unhappy path
515 // where a provided URI is not a URL that exists and is not a path that exists, to a single branch arm
516 // for all file types.
517 let url_str = fasta.url();
518 if is_likely_url(url_str) {
519 let _ = check_url(url_str).await?;
520 } else if !PathBuf::from(url_str).is_file() {
521 return Err(EntryError::InvalidURL(eyre!(
522 "The provided uri {url_str} was not a web link, nor was it a local file path pointing to something that exists."
523 )));
524 }
525 dataset_to_update.fasta = new_dataset.fasta;
526 },
527
528 // Do the same thing but with a putative genbank file
529 RefDataset {
530 genbank: Some(ref genbank),
531 ..
532 } => {
533 let url_str = genbank.url();
534 if is_likely_url(url_str) {
535 let _ = check_url(url_str).await?;
536 }
537 dataset_to_update.genbank = new_dataset.genbank;
538 },
539
540 // Do the same thing but with a putative GFA file
541 RefDataset {
542 gfa: Some(ref gfa), ..
543 } => {
544 let url_str = gfa.url();
545 if is_likely_url(url_str) {
546 let _ = check_url(url_str).await?;
547 }
548 dataset_to_update.gfa = new_dataset.gfa;
549 },
550
551 // Do the same thing but with a putative GFF file
552 RefDataset {
553 gff: Some(ref gff), ..
554 } => {
555 let url_str = gff.url();
556 if is_likely_url(url_str) {
557 let _ = check_url(url_str).await?;
558 }
559 dataset_to_update.gff = new_dataset.gff;
560 },
561
562 // Do the same thing but with a putative GTF file
563 RefDataset {
564 gtf: Some(ref gtf), ..
565 } => {
566 let url_str = gtf.url();
567 if is_likely_url(url_str) {
568 let _ = check_url(url_str).await?;
569 }
570 dataset_to_update.gtf = new_dataset.gtf;
571 },
572
573 // Do the same thing but with a putative BED file
574 RefDataset {
575 bed: Some(ref bed), ..
576 } => {
577 let url_str = bed.url();
578 if is_likely_url(url_str) {
579 let _ = check_url(url_str).await?;
580 }
581 dataset_to_update.bed = new_dataset.bed;
582 },
583
584 // Do the same thing but with a putative TAR file
585 RefDataset {
586 tar: Some(ref tar), ..
587 } => {
588 let url_str = tar.url();
589 if is_likely_url(url_str) {
590 let _ = check_url(url_str).await?;
591 }
592 dataset_to_update.tar = new_dataset.tar;
593 },
594
595 // If somehow this state has slipped through the cracks, it means there's no file to
596 // update the registry with, which is a `LabelButNoFiles` error
597 RefDataset {
598 label: _,
599 fasta: None,
600 genbank: None,
601 gfa: None,
602 gff: None,
603 gtf: None,
604 bed: None,
605 tar: None,
606 } => return Err(EntryError::LabelButNoFiles),
607 }
608
609 // If we've made it this far, all is well; return the mutated instance of
610 // the project.
611 Ok(self)
612 }
613
614 #[inline]
615 fn get_dataset_idx(&self, label: &str) -> Option<usize> {
616 // find the index of the old dataset to be updated with new information from
617 // the user
618 let dataset_match_indices: Vec<_> = self
619 .datasets()
620 .iter()
621 .enumerate()
622 .filter(|(_i, dataset)| dataset.label == label)
623 .map(|(i, _)| i)
624 .collect();
625
626 if dataset_match_indices.is_empty() {
627 return None;
628 }
629
630 // Make sure that the above system that we *assume* will work doesn't actually break (it should never
631 // be possible to have two dataset entries with the same label).
632 assert_eq!(
633 dataset_match_indices.len(),
634 1,
635 "Invalid state slipped through the cracks when identifying which dataset should be updated with the new information for dataset '{}'. Somehow, multiple indices were returned: {:?}",
636 label,
637 &dataset_match_indices
638 );
639
640 // With that assert passing, pull out the index usize
641 Some(dataset_match_indices[0])
642 }
643
644 #[allow(clippy::similar_names)]
645 pub(crate) fn collect_downloads(
646 &self,
647 label: Option<&str>,
648 target_dir: &Path,
649 ) -> Vec<(RefDataset, Vec<UnvalidatedFile>)> {
650 let datasets = if let Some(label) = label {
651 self.clone()
652 .datasets_owned()
653 .into_iter()
654 .filter(|dataset| dataset.label == label)
655 .collect::<Vec<_>>()
656 } else {
657 self.clone()
658 .datasets_owned()
659 .into_iter()
660 .collect::<Vec<_>>()
661 };
662 assert_ne!(0, datasets.len());
663 datasets
664 .into_iter()
665 .map(|dataset| {
666 let fasta = dataset.get_fasta_download(target_dir);
667 let genbank = dataset.get_genbank_download(target_dir);
668 let gfa = dataset.get_gfa_download(target_dir);
669 let gtf = dataset.get_gtf_download(target_dir);
670 let gff = dataset.get_gff_download(target_dir);
671 let bed = dataset.get_bed_download(target_dir);
672 let tar = dataset.get_tar_download(target_dir);
673 info!(
674 "Preparing to download these files:\n{:?}",
675 [&fasta, &genbank, &gfa, &gff, >f, &bed, &tar]
676 );
677 let files = [fasta, genbank, gfa, gff, gtf, bed, tar]
678 .into_iter()
679 .flatten()
680 .collect::<Vec<_>>();
681 (dataset, files)
682 })
683 .collect::<Vec<_>>()
684 }
685
686 /// Downloads a reference dataset from a Project's registry by label, fetching any registered file
687 /// URLs into a target directory.
688 ///
689 /// This method implements the core file downloading functionality in refman. Given a dataset label
690 /// and target directory, it will:
691 /// 1. Verify the dataset exists in the registry
692 /// 2. Extract all registered file URLs for that dataset (FASTA, Genbank, GFA, GFF, GTF, BED)
693 /// 3. Launch concurrent downloads of all files into the target directory
694 /// 4. Handle any download failures or errors
695 ///
696 /// Downloads happen asynchronously and in parallel for improved performance. The method uses
697 /// tokio for async runtime and reqwest for HTTP requests. Files are downloaded maintaining
698 /// their original filenames from the URLs.
699 ///
700 /// Dataset labels must exactly match what is registered (case-sensitive). The target directory
701 /// will be created if it doesn't exist. Existing files in the target directory may be
702 /// overwritten.
703 ///
704 /// This is used to fetch reference data after registering datasets with `register()`.
705 /// For example, after registering genome data with FASTA and GFF URLs, this method would
706 /// concurrently download both files locally.
707 ///
708 /// # Arguments
709 ///
710 /// * `label` - The unique label of the dataset to download, must match what was registered
711 /// * `target_dir` - Directory path where downloaded files should be saved
712 ///
713 /// # Returns
714 ///
715 /// Returns Ok(()) if all downloads complete successfully, or an error if:
716 /// - The dataset label is not found in the registry
717 /// - Any file downloads fail
718 /// - The target directory cannot be accessed/created
719 /// - Other IO or HTTP errors occur
720 ///
721 /// # Errors
722 ///
723 /// This method can return `EntryError::LabelNotFound` if the dataset is not in the registry,
724 /// as well as various IO and HTTP errors wrapped in `anyhow::Error` for failed downloads.
725 ///
726 /// # Panics
727 ///
728 /// This method will panic if:
729 /// - The progress bar style template is invalid
730 /// - Multiple instances simultaneously write to the same shared progress output
731 /// - The download futures report an internal thread failure
732 ///
733 #[allow(clippy::too_many_lines)]
734 pub async fn download_dataset(
735 self,
736 label: Option<&str>,
737 target_dir: PathBuf,
738 ) -> color_eyre::Result<Self> {
739 // make a new reqwest http client that can be shared between threads
740 let shared_client = Client::new();
741
742 // pull in the sets of files to be downloaded
743 let dataset_files: Vec<(RefDataset, Vec<UnvalidatedFile>)> =
744 self.collect_downloads(label, &target_dir);
745
746 // count the downloads
747 let num_to_download = count_downloads(&dataset_files);
748
749 // early return if there's nothing to download
750 if num_to_download == 0 {
751 info!(
752 "All requested files were previously downloaded and still passed checksums, so no downloads will be performed."
753 );
754 return Ok(self);
755 }
756
757 // set up a progress bar based on the number
758 let (mut toplevel_pb, multiprog) = setup_progress_tracking(label, num_to_download);
759
760 // put each download into its own tokio thread, and collect its handle into a vector
761 // that can be polled downstream
762 let dataset_task_handles =
763 submit_download_requests(dataset_files, &shared_client, &target_dir, &multiprog);
764
765 let updated_datasets =
766 update_project_datasets(dataset_task_handles, &mut toplevel_pb).await?;
767
768 // Once all downloads finish, update and finish the overall progress bar.
769 toplevel_pb.finish_with_message(format!(
770 "Done! {num_to_download} files successfully downloaded to {target_dir:?}."
771 ));
772
773 // Update the project and return it
774 let updated_project = self.update_registry(&updated_datasets);
775
776 Ok(updated_project)
777 }
778
779 #[must_use]
780 pub fn update_registry(self, new_datasets: &[RefDataset]) -> Project {
781 // make a hashmap of the old datasets and new datasets we can compare for available updates
782 let old_datasets: HashMap<&str, &RefDataset> = self
783 .datasets()
784 .iter()
785 .map(|dataset| (dataset.label.as_str(), dataset))
786 .collect();
787 let updated_datasets: HashMap<&str, &RefDataset> = new_datasets
788 .iter()
789 .map(|dataset| (dataset.label.as_str(), dataset))
790 .collect();
791
792 // if a key in the old dataset is also in a new dataset, swap in the new data
793 let merged_datasets: Vec<RefDataset> = old_datasets
794 .into_iter()
795 .map(|(label, dataset)| match updated_datasets.get(label) {
796 Some(aha) => (*aha).to_owned(),
797 None => dataset.clone(),
798 })
799 .collect();
800
801 // use Rust's nice struct update syntax to create a new registry
802 let updated_registry = Registry {
803 datasets: merged_datasets,
804 last_modified: Timestamp::now(),
805 ..self.project
806 };
807
808 // return a new instance of the project in functional style
809 Self {
810 project: updated_registry,
811 }
812 }
813
814 /// Removes a dataset from the Project's registry by its label.
815 ///
816 /// This method allows removing individual datasets from a refman Project's registry
817 /// while maintaining the integrity of the remaining datasets. It can be used to:
818 /// - Remove outdated or no longer needed reference datasets
819 /// - Clean up the registry by removing temporary entries
820 /// - Manage the project's dataset collection over time
821 ///
822 /// The method enforces several rules to maintain registry integrity:
823 /// - The label must exactly match an existing dataset (case-sensitive)
824 /// - The registry must maintain at least one dataset after removal
825 /// - Only one dataset can be removed at a time
826 ///
827 /// This complements `register()` an`download_dataset()` in the lifecycle of managing
828 /// reference data. While those methods add and fetch datasets, `remove()` allows
829 /// pruning datasets that are no longer needed.
830 ///
831 /// # Arguments
832 ///
833 /// * `label` - The unique label identifying the dataset to remove from the registry
834 ///
835 /// # Returns
836 ///
837 /// Returns Ok(Project) with the updated Project if removal succeeds, or an
838 /// `EntryError` in the following cases:
839 /// - `EntryError::LabelNotFound` if no dataset matches the provided label
840 /// - `EntryError::FinalEntry` if removing this dataset would empty the registry
841 ///
842 /// The Project instance is consumed and a new instance is returned to maintain
843 /// the builder pattern used throughout the API.
844 ///
845 /// # Errors
846 ///
847 /// This method can return the following errors:
848 /// - `EntryError::LabelNotFound` if the specified label is not in the registry
849 /// - `EntryError::FinalEntry` if removing this dataset would empty the registry
850 /// entirely (at least one dataset must always remain)
851 ///
852 pub fn remove(mut self, label: &str) -> Result<Self, EntryError> {
853 // make sure the label is in the recorded datasets
854 if self
855 .datasets()
856 .iter()
857 .filter(|dataset| dataset.label == label)
858 .collect::<Vec<&RefDataset>>()
859 .is_empty()
860 {
861 return Err(EntryError::LabelNotFound(label.to_string()));
862 }
863
864 // if it is, filter it out in place
865 self.project
866 .filter_datasets(|dataset| dataset.label != label);
867
868 // return an error if that was the last entry
869 if self.datasets().is_empty() {
870 return Err(EntryError::FinalEntry(label.to_string()));
871 }
872
873 // otherwise, return the mutated project
874 Ok(self)
875 }
876
877 fn print_single_label_data(self, label: &str) {
878 let datasets = self.datasets();
879 let dataset: Vec<_> = datasets
880 .iter()
881 .filter(|dataset| dataset.label == label)
882 .collect();
883 assert_eq!(
884 dataset.len(),
885 1,
886 "No project with the label '{label}' has been registered. Run `refman list` without the label to see which datasets are registered."
887 );
888 let unwrapped_dataset = dataset[0];
889
890 eprintln!("URLs registered for {label}:");
891 eprintln!("--------------------{}", "-".repeat(label.len()));
892 eprintln!(
893 " - FASTA: {}",
894 unwrapped_dataset
895 .fasta
896 .clone()
897 .unwrap_or(DownloadStatus::default())
898 );
899 eprintln!(
900 " - Genbank: {}",
901 unwrapped_dataset
902 .genbank
903 .clone()
904 .unwrap_or(DownloadStatus::default())
905 );
906 eprintln!(
907 " - GFA: {}",
908 unwrapped_dataset
909 .gfa
910 .clone()
911 .unwrap_or(DownloadStatus::default())
912 );
913 eprintln!(
914 " - GFF: {}",
915 unwrapped_dataset
916 .gff
917 .clone()
918 .unwrap_or(DownloadStatus::default())
919 );
920 eprintln!(
921 " - GTF: {}",
922 unwrapped_dataset
923 .gtf
924 .clone()
925 .unwrap_or(DownloadStatus::default())
926 );
927 eprintln!(
928 " - BED: {}",
929 unwrapped_dataset
930 .bed
931 .clone()
932 .unwrap_or(DownloadStatus::default())
933 );
934 eprintln!(
935 " - TAR: {}",
936 unwrapped_dataset
937 .tar
938 .clone()
939 .unwrap_or(DownloadStatus::default())
940 );
941 }
942
943 fn print_all_labels(self) {
944 // print a title field if it has been set
945 let title_field = &self.project.title;
946 if let Some(title) = title_field {
947 info!("Showing available data registered for {title}:");
948 }
949
950 // make a new mutable instance of a pretty table to be appended to
951 let mut pretty_table = Table::new();
952
953 // add the title row
954 pretty_table.add_row(row![
955 "Label", "FASTA", "Genbank", "GFA", "GFF", "GTF", "BED", "TAR",
956 ]);
957
958 // add rows for each dataset
959 let datasets = self.datasets();
960 for dataset in datasets {
961 pretty_table.add_row(row![
962 dataset.label,
963 abbreviate_str(
964 dataset
965 .fasta
966 .clone()
967 .unwrap_or(DownloadStatus::default())
968 .url_owned(),
969 20,
970 8,
971 25
972 ),
973 abbreviate_str(
974 dataset
975 .genbank
976 .clone()
977 .unwrap_or(DownloadStatus::default())
978 .url_owned(),
979 20,
980 8,
981 25
982 ),
983 abbreviate_str(
984 dataset
985 .gfa
986 .clone()
987 .unwrap_or(DownloadStatus::default())
988 .url_owned(),
989 20,
990 8,
991 25
992 ),
993 abbreviate_str(
994 dataset
995 .gff
996 .clone()
997 .unwrap_or(DownloadStatus::default())
998 .url_owned(),
999 20,
1000 8,
1001 25
1002 ),
1003 abbreviate_str(
1004 dataset
1005 .gtf
1006 .clone()
1007 .unwrap_or(DownloadStatus::default())
1008 .url_owned(),
1009 20,
1010 8,
1011 25
1012 ),
1013 abbreviate_str(
1014 dataset
1015 .bed
1016 .clone()
1017 .unwrap_or(DownloadStatus::default())
1018 .url_owned(),
1019 20,
1020 8,
1021 25
1022 ),
1023 abbreviate_str(
1024 dataset
1025 .tar
1026 .clone()
1027 .unwrap_or(DownloadStatus::default())
1028 .url_owned(),
1029 20,
1030 8,
1031 25
1032 ),
1033 ]);
1034 }
1035
1036 pretty_table.printstd();
1037 }
1038
1039 /// Pretty prints the currently registered datasets in a tabular format.
1040 ///
1041 /// This method provides a human-readable view of all reference datasets currently registered
1042 /// in the Project. It prints a formatted table showing each dataset's label and any
1043 /// registered file URLs for the supported bioinformatics formats (FASTA, Genbank, GFA,
1044 /// GFF, GTF, BED).
1045 ///
1046 /// The output is formatted as a table with columns for:
1047 /// - Dataset Label
1048 /// - FASTA URL (if registered)
1049 /// - Genbank URL (if registered)
1050 /// - GFA URL (if registered)
1051 /// - GFF URL (if registered)
1052 /// - GTF URL (if registered)
1053 /// - BED URL (if registered)
1054 ///
1055 /// Empty cells indicate that no URL is registered for that file format. If the Project
1056 /// has a title set, it will be displayed above the table.
1057 ///
1058 /// This provides an easy way to:
1059 /// - View all registered datasets at once
1060 /// - Check which file formats are available for each dataset
1061 /// - Verify dataset labels and URLs
1062 /// - Share the current state of your reference data registry
1063 ///
1064 /// The method consumes self as it follows the builder pattern used throughout the API.
1065 /// The actual printing is handled through the prettytable crate for consistent formatting.
1066 ///
1067 /// # Outputs
1068 ///
1069 /// Prints a formatted table to stdout. If the Project has a title, it is printed as a
1070 /// header above the table. Empty values in the table indicate no URL is registered for
1071 /// that format.
1072 ///
1073 /// # Notes
1074 ///
1075 /// The output is meant for human consumption and formatted for readability. For
1076 /// programmatic access to dataset information, use the `datasets()` or `datasets_owned()`
1077 /// methods instead.
1078 ///
1079 /// # Panics
1080 ///
1081 /// This method will panic if:
1082 /// - Multiple datasets with the same label exist in the registry when requesting a specific label
1083 /// - A requested dataset label does not exist when filtering registered datasets
1084 /// - The prettytable crate encounters an error when printing the output table
1085 pub fn prettyprint(self, label: Option<String>) {
1086 // if the user requested a label, just print the information for that label
1087 if let Some(label_str) = label {
1088 self.print_single_label_data(&label_str);
1089 return;
1090 }
1091
1092 // otherwise, print all datasets as a table
1093 self.print_all_labels();
1094 }
1095}
1096
/// Shortens `s` to its first `head_chars` and last `tail_chars` characters joined by `...`.
///
/// Lengths are measured in Unicode scalar values (`char`s), not bytes, so multi-byte
/// text is never split in the middle of a character.
///
/// The string is returned unchanged when:
/// - it has at most `max_chars` characters, or
/// - the abbreviated form (`head_chars + 3 + tail_chars` characters) would not be
///   shorter than the string itself. Without this check the head and tail windows
///   could overlap, duplicating characters and producing a result longer than the
///   input.
#[inline]
fn abbreviate_str(s: String, max_chars: usize, head_chars: usize, tail_chars: usize) -> String {
    // number of characters contributed by the "..." separator
    const ELLIPSIS_CHARS: usize = 3;

    // Count the characters in the string.
    let char_count = s.chars().count();

    // If the string is not too long, return it unchanged.
    if char_count <= max_chars {
        return s;
    }

    // If abbreviating would not actually shorten the string, return it unchanged too.
    let abbreviated_len = head_chars
        .saturating_add(tail_chars)
        .saturating_add(ELLIPSIS_CHARS);
    if char_count <= abbreviated_len {
        return s;
    }

    // Collect the first `head_chars` characters.
    let head: String = s.chars().take(head_chars).collect();

    // Collect the last `tail_chars` characters. The subtraction cannot underflow
    // because `char_count > abbreviated_len >= tail_chars` at this point.
    let tail: String = s.chars().skip(char_count - tail_chars).collect();

    format!("{head}...{tail}")
}
1122
/// The serializable state of a refman project: its metadata plus the registered datasets.
///
/// NOTE(review): the field order is presumably the order serde emits keys in the
/// TOML registry file, so reorder with care — confirm before changing.
#[derive(Debug, Serialize, Deserialize, Clone)]
struct Registry {
    // optional human-readable project title
    title: Option<String>,
    // optional free-text project description
    description: Option<String>,
    // time of the most recent change; refreshed when the registry is updated or written
    last_modified: Timestamp,
    // whether the registry is global (`true`) or local (`false`)
    global: bool,
    // every dataset currently registered with the project
    datasets: Vec<RefDataset>,
}
1131
1132impl Default for Registry {
1133 fn default() -> Self {
1134 Registry {
1135 title: None,
1136 description: None,
1137 last_modified: Timestamp::now(),
1138 global: false,
1139 datasets: vec![],
1140 }
1141 }
1142}
1143
1144impl Registry {
1145 fn filter_datasets<F>(&mut self, predicate: F)
1146 where
1147 F: FnMut(&RefDataset) -> bool,
1148 {
1149 self.datasets.retain(predicate);
1150 }
1151}
1152
/// A configuration struct for customizing how refman interacts with registry files in your filesystem.
///
/// `RegistryOptions` is the primary way to control where and how refman stores its data. It provides
/// methods to:
/// - Resolve the location of the registry file (`try_new`)
/// - Initialize a new registry file (`init`)
/// - Read from and write to an existing registry (`read_registry`, `write_registry`)
///
/// The registry path is resolved according to the following priority:
/// 1. A user-requested directory, in which case the registry is `<requested directory>/refman.toml`
/// 2. For global registries (`global = true`):
///    - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
///    - `~/.refman/refman.toml` as the default global location
///    - `./.refman/refman.toml` as a fallback when the home directory cannot be determined
/// 3. For local registries (`global = false`):
///    - `./refman.toml` in the current directory
///
/// This flexibility allows refman to support both project-specific local registries for individual
/// bioinformatics projects, as well as user-wide global registries for sharing reference data
/// between projects.
///
/// The struct stores the resolved path to the registry file, along with the project metadata
/// (title and description) and the global/local setting that are used when a new registry is
/// initialized.
pub struct RegistryOptions {
    // full path of the `refman.toml` registry file, as resolved by `try_new`
    resolved_path: PathBuf,
    // optional project title written into a newly initialized registry
    title: Option<String>,
    // optional project description written into a newly initialized registry
    description: Option<String>,
    // whether the registry is global (`true`) or local (`false`)
    global: bool,
}
1191
1192impl RegistryOptions {
1193 /// Creates a new `RegistryOptions` instance with customized settings for registry file handling.
1194 ///
1195 /// This struct provides granular control over how refman interacts with registry files,
1196 /// determining where they are stored and how they are initialized. It implements the core
1197 /// logic for resolving registry paths according to the following priority:
1198 ///
1199 /// 1. User-specified custom path via `requested_path` parameter
1200 /// 2. For global registries (`global = true`):
1201 /// - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
1202 /// - ~/.refman/refman.toml as default global location
1203 /// 3. For local registries (`global = false`):
1204 /// - ./refman.toml in current directory
1205 ///
1206 /// The struct handles all filesystem interactions needed to:
1207 /// - Resolve and validate registry file paths
1208 /// - Create new registry files or directories as needed
1209 /// - Manage environment variables like `REFMAN_HOME`
1210 /// - Initialize registries with project metadata
1211 ///
1212 /// It works closely with the Project struct to provide the foundational registry
1213 /// management capabilities that refman builds upon. While most users will interact
1214 /// with registries through the Project API, this struct allows advanced users to
1215 /// customize registry behavior.
1216 ///
1217 /// The method performs validation to ensure the requested registry location is
1218 /// accessible and can be written to. It handles edge cases like missing directories
1219 /// and environment variables gracefully.
1220 ///
1221 /// # Arguments
1222 ///
1223 /// * `title` - Optional title for the registry/project
1224 /// * `description` - Optional description text
1225 /// * `requested_path` - Optional custom path where the registry should be stored
1226 /// * `global` - Whether this is a global (true) or local (false) registry
1227 ///
1228 /// # Returns
1229 ///
1230 /// Returns Ok(RegistryOptions) if initialization succeeds, or `RegistryError` if:
1231 /// - The requested path is invalid or inaccessible
1232 /// - Required directories cannot be created
1233 /// - Environment variables cannot be set
1234 /// - Other filesystem operations fail
1235 ///
1236 /// # Errors
1237 ///
1238 /// This method can return `RegistryError` variants for various filesystem and
1239 /// environment access failures. The error types provide context about what
1240 /// specifically failed during registry setup.
1241 pub fn try_new(
1242 title: Option<String>,
1243 description: Option<String>,
1244 requested_path: &Option<String>,
1245 global: bool,
1246 ) -> Result<RegistryOptions, RegistryError> {
1247 // If the user requested a path, see if it exists and is accessible, and
1248 // try to make it work
1249 if let Some(possible_path) = requested_path.as_deref() {
1250 let maybe_path = PathBuf::from_str(possible_path).ok();
1251 let resolved_path = resolve_registry_path(maybe_path, global)?;
1252
1253 Ok(Self {
1254 resolved_path,
1255 title,
1256 description,
1257 global,
1258 })
1259 // otherwise, resolve a path with default settings
1260 } else {
1261 let resolved_path = resolve_registry_path(None, global)?;
1262
1263 Ok(Self {
1264 resolved_path,
1265 title,
1266 description,
1267 global,
1268 })
1269 }
1270 }
1271
1272 /// Initializes a new registry file for the Project if one doesn't already exist.
1273 ///
1274 /// This method handles creating and initializing the registry file that stores a
1275 /// Project's reference datasets and metadata. The registry file location is determined
1276 /// by the `RegistryOptions` configuration, following these rules:
1277 ///
1278 /// 1. User-specified custom path if provided to `RegistryOptions::try_new()`
1279 /// 2. For global registries (global = true):
1280 /// - `$REFMAN_HOME/.refman/refman.toml` if `REFMAN_HOME` is set
1281 /// - ~/.refman/refman.toml as default global location
1282 /// 3. For local registries (global = false):
1283 /// - ./refman.toml in current directory
1284 ///
1285 /// The method will:
1286 /// - Create a new refman.toml file if one doesn't exist at the resolved path
1287 /// - Initialize it with provided title and description if specified
1288 /// - Set appropriate global/local flag
1289 /// - Create any necessary parent directories
1290 /// - Handle filesystem permissions and access
1291 ///
1292 /// If a registry file already exists at the target location, the method will
1293 /// log an informational message and take no action, preserving the existing
1294 /// registry data.
1295 ///
1296 /// This is typically called automatically when creating new Projects, but can
1297 /// be called directly for custom registry initialization workflows. The method
1298 /// integrates with refman's overall registry management system to maintain
1299 /// data integrity and consistent state.
1300 ///
1301 /// # Returns
1302 ///
1303 /// Returns Ok(()) if initialization succeeds or registry already exists.
1304 /// Returns `RegistryError` if filesystem operations fail due to permissions,
1305 /// invalid paths, or other IO errors.
1306 ///
1307 /// # Errors
1308 ///
1309 /// Can return `RegistryError` variants for:
1310 /// - Failed file creation
1311 /// - Invalid paths
1312 /// - Insufficient permissions
1313 /// - Filesystem errors
1314 pub fn init(&self) -> Result<(), RegistryError> {
1315 // If a refman.toml doesn't exist, make it and write out the available information
1316 if self.resolved_path.exists() {
1317 info!("A refman registry already exists. Start filling it with `refman register`.");
1318 } else {
1319 let mut new_project =
1320 Project::new(self.title.clone(), self.description.clone(), self.global);
1321 File::create(&self.resolved_path)?;
1322
1323 self.write_registry(&mut new_project)?;
1324 // Otherwise, do nothing except log out that a registry file already exists
1325 }
1326 Ok(())
1327 }
1328
1329 /// Reads and deserializes a registry file into a Project, or initializes a new empty Project.
1330 ///
1331 /// This method handles loading registry data from refman.toml files. It follows these rules:
1332 /// - If no registry file exists at the resolved path, returns a default empty Project
1333 /// - If an empty registry file exists, returns a default empty Project
1334 /// - Otherwise deserializes the TOML file into a Project instance
1335 ///
1336 /// The registry file path is determined by `RegistryOptions` rules, in order:
1337 /// 1. User-specified custom path if provided
1338 /// 2. For global registries (global = true):
1339 /// - `$REFMAN_HOME/.refman/refman.toml`
1340 /// - ~/.refman/refman.toml (default)
1341 /// 3. For local registries (global = false):
1342 /// - ./refman.toml
1343 ///
1344 /// This method is core to refman's persistence layer, allowing Projects to be saved and
1345 /// loaded across sessions. It works in tandem with `write_registry()` to maintain registry
1346 /// state. The registry files store:
1347 /// - Project metadata (title, description)
1348 /// - Dataset entries with labels and file URLs
1349 /// - Last modified timestamp
1350 /// - Global/local status
1351 ///
1352 /// The method handles common edge cases like:
1353 /// - Missing registry files
1354 /// - Empty registry files
1355 /// - Invalid TOML formatting
1356 /// - File access errors
1357 ///
1358 /// This is typically called internally by Project methods that need to load registry
1359 /// state, but can be used directly for custom registry reading workflows.
1360 ///
1361 /// # Returns
1362 ///
1363 /// Returns Ok(Project) containing either:
1364 /// - A deserialized Project from the registry file
1365 /// - A new empty Project if no valid registry exists
1366 ///
1367 /// # Errors
1368 ///
1369 /// Returns `RegistryError` if:
1370 /// - File operations fail (permissions, IO errors)
1371 /// - TOML deserialization fails
1372 /// - Registry path resolution fails
1373 pub fn read_registry(&self) -> Result<Project, RegistryError> {
1374 // To save some effort, first check if the refman.toml exists. If it doesn't,
1375 // just set up a project with default settings and early-return that
1376 if !self.resolved_path.exists() {
1377 let new_project = Project::default();
1378 return Ok(new_project);
1379 }
1380
1381 // Additionally, if a file exists but is empty, pretend it doesn't exist and do
1382 // the same thing as above
1383 if fs::metadata(&self.resolved_path)?.len() == 0 {
1384 let new_project = Project::default();
1385 return Ok(new_project);
1386 }
1387
1388 // If neither of those conditions were met, read and deserialize the TOML
1389 // file into a Project struct and return it
1390 let toml_contents = read_to_string(self.resolved_path.clone())?;
1391 let project: Project = toml::from_str(&toml_contents)?;
1392 Ok(project)
1393 }
1394 /// Writes a Project's registry data to the refman.toml file at the resolved registry path.
1395 ///
1396 /// This method handles persisting Project state to disk, including:
1397 /// - All registered datasets with their labels and file URLs
1398 /// - Project metadata like title and description
1399 /// - Last modified timestamp
1400 /// - Global/local registry status
1401 ///
1402 /// The registry file location follows `RegistryOptions` rules, in order:
1403 /// 1. User-specified custom path if provided
1404 /// 2. For global registries (global = true):
1405 /// - `$REFMAN_HOME/.refman/refman.toml`
1406 /// - ~/.refman/refman.toml (default)
1407 /// 3. For local registries (global = false):
1408 /// - ./refman.toml
1409 ///
1410 /// This method works in tandem with `read_registry()` to maintain persistent state
1411 /// across refman sessions. When writing, it:
1412 /// - Updates the last modified timestamp
1413 /// - Serializes the Project data to TOML format
1414 /// - Writes the TOML to the resolved registry path
1415 /// - Creates/overwrites the registry file as needed
1416 ///
1417 /// This is typically called internally by Project methods that modify state, but
1418 /// can be used directly for custom registry writing workflows. The method integrates
1419 /// with refman's overall registry management system to maintain data integrity.
1420 ///
1421 /// # Arguments
1422 ///
1423 /// * `project` - Mutable reference to the Project whose state should be written
1424 ///
1425 /// # Returns
1426 ///
1427 /// Returns Ok(()) if the write succeeds, or `RegistryError` if filesystem operations fail.
1428 ///
1429 /// # Errors
1430 ///
1431 /// Returns `RegistryError` if:
1432 /// - File operations fail (permissions, IO errors)
1433 /// - TOML serialization fails
1434 /// - Registry path resolution fails
1435 ///
1436 /// # Panics
1437 ///
1438 /// This method does not panic under normal circumstances, but may panic if the filesystem
1439 /// becomes inaccessible while writing or if memory allocation fails during serialization.
1440 pub fn write_registry(&self, project: &mut Project) -> Result<(), RegistryError> {
1441 // update the timestamp
1442 project.project.last_modified = Timestamp::now();
1443
1444 // serialize and write out the TOML file
1445 let toml_text = toml::to_string_pretty(project)?;
1446 fs::write(&self.resolved_path, toml_text)?;
1447
1448 Ok(())
1449 }
1450}
1451
1452fn resolve_registry_path(
1453 maybe_path: Option<PathBuf>,
1454 global: bool,
1455) -> Result<PathBuf, RegistryError> {
1456 // to resolve a registry path, a fair amount of control flow needs to happen to unwrap a few conditions.
1457 // First, we prioritize a directory the user requests we place the registry in, if provided. This is the simplest
1458 // branch and comes first.
1459 let registry_path = match maybe_path {
1460 Some(valid_path) => {
1461 if let Some(path_str) = valid_path.to_str() {
1462 debug!("Setting the refman home to '{path_str}'");
1463 set_refman_home(path_str);
1464 }
1465 valid_path.join("refman.toml")
1466 },
1467
1468 // If the user did not request a particular directory, we then check if a global registry was requested.
1469 // If not, this is the next simplest case; just place the registry in the current working directory (ideally,
1470 // the project root).
1471 None => {
1472 // If not global, use the current directory as the refman home and return the full path.
1473 if !global {
1474 let current_dir = current_dir()?;
1475 if let Some(current_dir_string) = current_dir.to_str() {
1476 debug!("Setting the refman home to '{current_dir_string}'");
1477 set_refman_home(current_dir_string);
1478 }
1479
1480 return Ok(current_dir.join("refman.toml"));
1481 }
1482
1483 // If no desired directory was provided, but the user also requested that the registry is global, first
1484 // check the environment variable REFMAN_HOME for the registry's location.
1485 let refman_home: Option<PathBuf> = match env::var("REFMAN_HOME") {
1486 Ok(path_str) => {
1487 debug!(
1488 "Desired file path detected in the REFMAN_HOME environment variable: '{}'. A global registry will be placed there.",
1489 path_str
1490 );
1491 let path = PathBuf::from(path_str);
1492 Some(path)
1493 },
1494 // If that environment variable isn't set, place it in the home directory.
1495 Err(_) => {
1496 debug!(
1497 "The REFMAN_HOME variable is not set. The registry will thus be placed in its default location in the user's home directory."
1498 );
1499 dirs::home_dir()
1500 },
1501 };
1502
1503 // Finally, whether the home directory is being used or the current directory as a fallback, join on
1504 // a subdirectory called ".refman" and then "refman.toml" onto that.
1505 if let Some(dir) = refman_home {
1506 let resolved_home = dir.join(".refman");
1507 debug!("setting the refman home to '{:?}'", resolved_home);
1508 resolved_home
1509 } else {
1510 warn!("unable to access home directory, so `refman `will place its registry in the current working directory. unless this path is provided in the next `refman` run, `refman` may be unable to pick up where it leaves off during the current run.");
1511 let current_dir = current_dir()?;
1512 if let Some(current_dir_string) = current_dir.to_str() {
1513 debug!("setting the refman home to '{current_dir_string}'");
1514 set_refman_home(current_dir_string);
1515 }
1516 let resolved_home = current_dir.join(".refman");
1517 debug!("setting the refman home to '{:?}'", resolved_home);
1518 resolved_home
1519 }.join("refman.toml")
1520 }, // TODO: Eventually, it would be cool to have a global dotfile config for refman so the user doesn't have
1521 // to tell it to operate globally every time.
1522 };
1523
1524 Ok(registry_path)
1525}
1526
1527fn set_refman_home(desired_dir: &str) {
1528 // If REFMAN_HOME is set,
1529 if let Ok(old_home) = env::var("REFMAN_HOME") {
1530 warn!(
1531 "The environment variable $REFMAN_HOME was previously set to {}, but a new location at {} was requested. `refman` will overwrite the old $REFMAN_HOME value and proceed.",
1532 old_home, desired_dir
1533 );
1534 unsafe { env::set_var("REFMAN_HOME", desired_dir) }
1535 } else {
1536 debug!(
1537 "The REFMAN_HOME environment variable has not previously been set. Now setting it to the requested directory, {}",
1538 desired_dir
1539 );
1540 unsafe { env::set_var("REFMAN_HOME", desired_dir) }
1541 }
1542}
1543
/// Heuristically decides whether `url` looks like a remote location, based purely on
/// whether it starts with `http`, `ftp` or `sftp` (case-sensitive).
fn is_likely_url(url: &str) -> bool {
    const URL_PREFIXES: [&str; 3] = ["http", "ftp", "sftp"];

    URL_PREFIXES.iter().any(|prefix| url.starts_with(*prefix))
}
1547
1548#[inline]
1549fn count_downloads(dataset_files: &[(RefDataset, Vec<UnvalidatedFile>)]) -> usize {
1550 // count the files to generate a message to inform the user of what will be downloaded
1551 let mut num_to_download = 0;
1552 for (_, files) in dataset_files {
1553 num_to_download += files.len();
1554 }
1555 info!("{num_to_download} downloads are confirmed. Proceeding...");
1556 num_to_download
1557}
1558
1559#[allow(clippy::expect_used)]
1560fn setup_progress_tracking(
1561 label: Option<&str>,
1562 num_to_download: usize,
1563) -> (ProgressBar, Arc<MultiProgress>) {
1564 // generate a message based on whether a particular dataset was requested as well as on the number
1565 // of files to be downloaded.
1566 let message = match label {
1567 Some(label_str) => {
1568 format!("Downloading {num_to_download} files for project labeled '{label_str}'...")
1569 },
1570 None => format!("Downloading all {num_to_download} files listed in the refman registry..."),
1571 };
1572
1573 // Create a shared MultiProgress container.
1574 let multi_pb = Arc::new(MultiProgress::new());
1575
1576 // Create a top-level progress bar with total length equal to the number of files, and set its starting message
1577 // with the message computed above
1578 let toplevel_pb = multi_pb.add(ProgressBar::new(num_to_download as u64));
1579 toplevel_pb.set_style(
1580 ProgressStyle::default_bar()
1581 .template("{msg} [{bar:40.cyan/blue}] {pos}/{len} ({eta})")
1582 .expect("Failed to set template"),
1583 );
1584 toplevel_pb.set_message(message);
1585
1586 // return a raw tuple containing the number to download, the top-level progress bar, and the per-file
1587 // progress bar
1588 (toplevel_pb, multi_pb)
1589}
1590
1591fn submit_download_requests(
1592 dataset_files: Vec<(RefDataset, Vec<UnvalidatedFile>)>,
1593 shared_client: &Client,
1594 target_dir: &Path,
1595 mp: &Arc<MultiProgress>,
1596) -> Vec<JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>> {
1597 // count the number of files to download
1598 let num_to_download = dataset_files.len();
1599
1600 // use that count to initialize a vector of exactly the right length, each item of which will be a deeply
1601 // nested result of vectors of results. This is because tasks will be spawned at two levels: one task per
1602 // request `RefDataset`, and all the registered files per `RefDataset`.
1603 let mut dataset_task_handles: Vec<
1604 JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>,
1605 > = Vec::with_capacity(num_to_download);
1606
1607 // Go through each dataset and its registered files and request them. This design can be thought of somewhat like
1608 // actors, where each dataset task supervises each file download task
1609 for (dataset, files) in dataset_files {
1610 let shared_client = shared_client.clone();
1611 let mp = mp.clone();
1612 let target_dir = Arc::new(target_dir.to_path_buf());
1613
1614 // Spawn a task per dataset
1615 let handle: JoinHandle<_> = tokio::spawn(async move {
1616 // Inside this task: spawn parallel tasks for each file
1617 let file_task_handles = files.into_iter().map(|file| {
1618 let client = shared_client.clone();
1619 let dir = target_dir.clone();
1620 let mp = mp.clone();
1621
1622 tokio::spawn(async move { request_dataset(file, client, dir, mp).await })
1623 });
1624
1625 // Await all file download tasks for this dataset
1626 let file_results = try_join_all(file_task_handles).await?;
1627
1628 Ok((dataset, file_results))
1629 });
1630
1631 // collect all the dataset "supervisor" tasks
1632 dataset_task_handles.push(handle);
1633 }
1634
1635 dataset_task_handles
1636}
1637
1638async fn update_project_datasets(
1639 dataset_task_handles: Vec<JoinHandle<Result<(RefDataset, MultiDownloadResults), ColorError>>>,
1640 toplevel_pb: &mut ProgressBar,
1641) -> color_eyre::Result<Vec<RefDataset>> {
1642 // await all tasks in all threads, collecting them into a vec after the transformations below
1643 let updated_datasets: Vec<RefDataset> = try_join_all(dataset_task_handles)
1644 .await?
1645 // no need to reference each item; consuming them is fine here
1646 .into_iter()
1647 // take each attempted download, and, if successful, increment the progress bar, and then keep the
1648 // successful unvalidated downloads in a vector
1649 .filter_map(|dataset_result| {
1650 toplevel_pb.inc(1);
1651 match dataset_result {
1652 Ok((dataset, file_results)) => {
1653 match file_results.into_iter().collect::<Result<Vec<_>, _>>() {
1654 Ok(successful_files) => Some((dataset, successful_files)),
1655 Err(msg) => {
1656 warn!("Failed to download files because of this error: {}", msg);
1657 None
1658 }
1659 }
1660 }
1661 Err(msg) => {
1662 warn!("Failed to download files because of this error: {}", msg);
1663 None
1664 }
1665 }
1666 })
1667 // now use each successful download to update its associated dataset, returning an owned updated dataset or
1668 // a validation error (the update performs validation under the hood)
1669 .map(
1670 |(mut dataset, files)| -> Result<RefDataset, ValidationError> {
1671 for file in files {
1672 dataset.update_with_download(&file)?;
1673 }
1674 Ok(dataset)
1675 },
1676 )
1677 .collect::<Result<Vec<RefDataset>, ValidationError>>()?;
1678
1679 // return the vector of updated `RefDataset` instances
1680 Ok(updated_datasets)
1681}
1682
#[cfg(test)]
mod tests {
    // Unwrapping is acceptable in tests: a panic is reported as a test failure.
    #![allow(clippy::expect_used, clippy::unwrap_used)]

    use super::*;
    use tempfile::tempdir;

    /// A freshly constructed project keeps the supplied title and description, has its
    /// `global` flag unset, and starts with no datasets.
    #[test]
    fn test_new_project() {
        let project = Project::new(
            Some("Test Project".to_string()),
            Some("A test project".to_string()),
            false,
        );
        let registry = &project.project;

        assert_eq!(registry.title.as_deref(), Some("Test Project"));
        assert_eq!(registry.description.as_deref(), Some("A test project"));
        assert!(!registry.global);
        assert!(registry.datasets.is_empty());
    }

    /// Building options for an explicit directory resolves the registry path to
    /// `refman.toml` inside that directory and carries the metadata through unchanged.
    #[test]
    fn test_registry_options_new() {
        // Keep the guard alive for the whole test so the directory is not removed early.
        let scratch = tempdir().unwrap();
        let requested_dir = Some(scratch.path().to_str().unwrap().to_string());

        let options = RegistryOptions::try_new(
            Some(String::from("Test Registry")),
            Some(String::from("Test Description")),
            &requested_dir,
            false,
        )
        .unwrap();

        let expected_path = scratch.path().join("refman.toml");
        assert_eq!(options.resolved_path, expected_path);
        assert_eq!(options.title.as_deref(), Some("Test Registry"));
        assert_eq!(options.description.as_deref(), Some("Test Description"));
        assert!(!options.global);
    }

    /// Writing an empty project and reading it back round-trips to a registry with no
    /// datasets.
    #[test]
    fn test_read_write_registry() {
        let scratch = tempdir().unwrap();
        let requested_dir = Some(scratch.path().to_str().unwrap().to_string());
        let options = RegistryOptions::try_new(None, None, &requested_dir, false).unwrap();

        // After writing, the resolved registry path must exist on disk.
        let mut empty_project = Project::new(None, None, false);
        options.write_registry(&mut empty_project).unwrap();
        assert!(options.resolved_path.exists());

        // Reading the registry back must yield zero datasets.
        let round_tripped = options.read_registry().unwrap();
        assert_eq!(round_tripped.datasets().len(), 0);
    }
}