acorn_lib/
lib.rs

1//! # 🌱 ACORN Library
2//!
3//! `acorn-lib` is a one-stop-shop for everything related to building and maintaining research activity data (RAD)-related technology, including the Accessible Content Optimization for Research Needs (ACORN) tool.
4//! The modules, structs, enums and constants found here support the ACORN CLI, which checks, analyzes, and exports research activity data into useable formats.
5//!
6use color_eyre::eyre;
7use derive_more::Display;
8use indicatif::{ProgressBar, ProgressStyle};
9use owo_colors::OwoColorize;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use reqwest::header::{HeaderMap, USER_AGENT};
13use serde::{Deserialize, Serialize};
14use serde_json::Result;
15use serde_with::skip_serializing_none;
16use std::fmt::Debug;
17use std::fs::File;
18use std::io::{copy, Cursor};
19use std::path::PathBuf;
20use std::vec;
21use tracing::{debug, error, trace, warn};
22use uriparse::URI;
23use urlencoding::encode;
24
25pub mod analyzer;
26pub mod constants;
27pub mod doctor;
28pub mod powerpoint;
29pub mod schema;
30pub mod util;
31
32use crate::util::*;
33
/// File names that are skipped when downloading bucket content
///
/// Any path ending with one of these names is ignored:
///
/// - `.gitignore`
/// - `.gitlab-ci.yml`
/// - `.gitkeep`
/// - `.DS_Store`
/// - `README.md`
pub const IGNORE: [&str; 5] = [
    ".gitignore",
    ".gitlab-ci.yml",
    ".gitkeep",
    ".DS_Store",
    "README.md",
];
41
42/// Type for GitLab tree entry
43#[derive(Clone, Debug, Display, Serialize, Deserialize, PartialEq, PartialOrd, Eq, Ord)]
44#[serde(rename_all = "lowercase")]
45pub enum EntryType {
46    /// List of files and directories
47    ///
48    /// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
49    #[display("tree")]
50    Tree,
51    /// Base64 enoded content
52    ///
53    /// See <https://docs.gitlab.com/api/repositories/#get-a-blob-from-repository>
54    #[display("blob")]
55    Blob,
56}
57/// Git hosting repository data
58#[derive(Clone, Debug, Display, Serialize, Deserialize)]
59#[serde(tag = "provider", rename_all = "lowercase")]
60pub enum Repository {
61    /// GitHub
62    ///
63    /// See <https://docs.github.com/en/rest/reference/repos>
64    #[display("github")]
65    GitHub {
66        /// Repository URI
67        uri: String,
68    },
69    /// GitLab
70    ///
71    /// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
72    #[display("gitlab")]
73    GitLab {
74        /// Integer ID of GitLab project
75        ///
76        /// See <https://docs.gitlab.com/api/projects/#get-a-single-project> for more information
77        id: Option<u64>,
78        /// Repository URI
79        uri: String,
80    },
81}
82/// Struct for buckets configuration
83///
84/// ### Example buckets.json
85/// ```json
86/// {
87///     "buckets": [
88///         {
89///             "name": "example",
90///             "repository": {
91///                 "provider": "github",
92///                 "uri": "https://github.com/username/example"
93///             }
94///         },
95///         {
96///             "name": "example",
97///             "repository": {
98///                 "provider": "gitlab",
99///                 "id": 12345,
100///                 "uri": "https://gitlab.com/username/example"
101///             }
102///         }
103///     ]
104/// }
105/// ```
106#[derive(Clone, Debug, Serialize, Deserialize)]
107pub struct BucketsConfig {
108    /// List of buckets
109    pub buckets: Vec<Bucket>,
110}
111/// Struct for bucket data
112#[derive(Clone, Debug, Serialize, Deserialize)]
113#[serde(rename_all = "camelCase")]
114pub struct Bucket {
115    /// Bucket name
116    ///
117    /// See <https://schema.org/name>
118    pub name: String,
119    /// Bucket description
120    ///
121    /// See <https://schema.org/description>
122    pub description: Option<String>,
123    /// Code repository data of bucket
124    ///
125    /// See <https://schema.org/codeRepository>
126    #[serde(alias = "repository")]
127    pub code_repository: Repository,
128}
129/// Struct for [GitHub] tree entry
130///
131/// [GitHub]: https://docs.github.com/en/rest
132#[skip_serializing_none]
133#[derive(Clone, Debug, Serialize, Deserialize)]
134pub struct GithubTreeEntry {
135    /// Path of tree entry
136    ///
137    /// The path inside the repository. Used to get content of subdirectories.
138    pub path: String,
139    /// Mode of tree entry
140    pub mode: String,
141    /// Type of tree entry
142    #[serde(rename = "type")]
143    pub entry_type: EntryType,
144    /// SHA1 of entry
145    pub sha: String,
146    /// Size of associated data
147    /// ### Note
148    /// > Not included for "tree" type entries
149    pub size: Option<u64>,
150    /// URL of associated data API endpoint
151    ///
152    /// Basically, a combination of the API endpoint and the SHA
153    pub url: String,
154}
155/// Struct for [GitHub] tree API response
156///
157/// GitHub API endpoint for trees returns
158/// ```json
159/// {
160///   "sha": "...",
161///   "url": "<endpoint>/repos/<owner>/<repo>/git/trees/<sha>",
162///   "tree": [...],
163///   "truncated": false
164/// }
165/// ```
166/// where `"tree"` is a list of [GithubTreeEntry].
167///
168/// ### Example Endpoint
169/// > `https://api.github.com/repos/jhwohlgemuth/pwsh-prelude/git/trees/master?recursive=1`
170///
171/// See [documentation] for more information
172///
173/// [GitHub]: https://docs.github.com/en/rest
174/// [documentation]: https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree
175#[derive(Clone, Debug, Serialize, Deserialize)]
176pub struct GithubTreeResponse {
177    /// SHA1 of tree
178    pub sha: String,
179    /// URL of associated data API endpoint
180    pub url: String,
181    /// List of [GithubTreeEntry]
182    pub tree: Vec<GithubTreeEntry>,
183    /// Whether tree is truncated
184    pub truncated: bool,
185}
186/// Struct for GitLab tree entry
187///
188/// See <https://docs.gitlab.com/api/repositories/#list-repository-tree>
189#[derive(Clone, Debug, Serialize, Deserialize)]
190pub struct GitlabTreeEntry {
191    /// Integer ID of GitLab project
192    ///
193    /// See <https://docs.gitlab.com/api/projects/#get-a-single-project> for more information
194    pub id: String,
195    /// Name of tree entry
196    pub name: String,
197    /// Type of tree entry
198    #[serde(rename = "type")]
199    pub entry_type: EntryType,
200    /// Path of tree entry
201    ///
202    /// The path inside the repository. Used to get content of subdirectories.
203    pub path: String,
204    /// Mode of tree entry
205    pub mode: String,
206}
207/// Struct for release data from GitLab or GitHub
208#[derive(Clone, Debug, Serialize, Deserialize)]
209pub struct Release {
210    /// Name of release
211    pub name: String,
212    /// Tag name of release
213    /// ### Example
214    /// > `v1.0.0`
215    pub tag_name: String,
216    /// Prose description of release
217    #[serde(alias = "body")]
218    pub description: String,
219    /// Date of release creation
220    pub created_at: String,
221    /// Date of release publication
222    #[serde(alias = "published_at")]
223    pub released_at: String,
224}
225impl Bucket {
226    /// Parse GitHub tree entries
227    fn parse_github_response(response: reqwest::blocking::Response) -> Vec<String> {
228        let content = response.text().unwrap();
229        let data: Result<GithubTreeResponse> = serde_json::from_str(&content);
230        match data {
231            | Ok(GithubTreeResponse { tree, .. }) => {
232                debug!("=> {} {} Tree entries", Label::found(), tree.len());
233                tree.into_iter().filter(GithubTreeEntry::is_blob).map(GithubTreeEntry::path).collect()
234            }
235            | Err(why) => {
236                error!("=> {} Process tree entries - {why}", Label::fail());
237                vec![]
238            }
239        }
240    }
241    /// Parse GitLab tree entries
242    fn parse_gitlab_response(response: reqwest::blocking::Response) -> Vec<String> {
243        let content = response.text().unwrap();
244        let data: Result<Vec<GitlabTreeEntry>> = serde_json::from_str(&content);
245        debug!("=> {} {} Tree entries", Label::found(), data.as_ref().unwrap().len());
246        match data {
247            | Ok(entries) => entries.into_iter().filter(GitlabTreeEntry::is_blob).map(GitlabTreeEntry::path).collect(),
248            | Err(why) => {
249                error!("=> {} Process tree entries - {why}", Label::fail());
250                vec![]
251            }
252        }
253    }
254    /// Get hosting domain from bucket struct
255    fn domain(&self) -> String {
256        match &self.code_repository {
257            | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
258                | Ok(uri) => uri.host().unwrap().to_string(),
259                | Err(_) => "github.com".to_string(),
260            },
261            | Repository::GitLab { uri, .. } => match URI::try_from(uri.as_str()) {
262                | Ok(uri) => uri.host().unwrap().to_string(),
263                | Err(_) => "gitlab.com".to_string(),
264            },
265        }
266    }
267    fn tree(&self, directory: &str, page: Option<u32>) -> eyre::Result<reqwest::blocking::Response, reqwest::Error> {
268        let url = self.tree_url(directory, page);
269        let client = Client::new();
270        client.get(url.unwrap_or_default()).header(USER_AGENT, "rust-web-api-client").send()
271    }
272    fn tree_url(&self, directory: &str, page: Option<u32>) -> Option<String> {
273        match &self.code_repository {
274            | Repository::GitHub { uri } => {
275                let parsed = match URI::try_from(uri.as_str()) {
276                    | Ok(value) => value,
277                    | Err(why) => {
278                        warn!(uri, "=> {} Parse GitHub URI - {why}", Label::fail());
279                        return None;
280                    }
281                };
282                let path = parsed.path();
283                let url = format!("https://api.{}/repos{}/git/trees/main?recursive=1", self.domain(), path);
284                debug!(url = url.as_str(), "=> {}", Label::using());
285                Some(url)
286            }
287            | Repository::GitLab { .. } => {
288                if let Some(id) = &self.code_repository.id() {
289                    let per_page = 100;
290                    let url = format!(
291                        "https://{}/api/v4/projects/{}/repository/tree?&per_page={}&page={}&recursive=true&path={}",
292                        self.domain(),
293                        id,
294                        per_page,
295                        page.unwrap_or_default(),
296                        directory
297                    );
298                    debug!(url = url.as_str(), "=> {}", Label::using());
299                    Some(url)
300                } else {
301                    None
302                }
303            }
304        }
305    }
306    /// Download files from bucket to local directory
307    ///
308    /// Ignores files listed in [`IGNORE`]
309    pub fn download_files(self: Bucket, output: PathBuf) -> usize {
310        fn count_json_files(paths: Vec<String>) -> usize {
311            paths.clone().into_iter().filter(|path| path.to_lowercase().ends_with(".json")).count()
312        }
313        fn count_image_files(paths: Vec<String>) -> usize {
314            paths.into_iter().filter(has_image_extension).count()
315        }
316        fn download_complete_message(name: String, json_count: usize, image_count: usize) -> String {
317            let total = json_count + image_count;
318            let message = if json_count != image_count {
319                let recommendation = if json_count > image_count {
320                    "Do you need to add some images?"
321                } else {
322                    "Do you need to add some JSON files?"
323                };
324                format!(
325                    " ({} data file{}, {} image{} - {})",
326                    json_count.yellow(),
327                    suffix(json_count),
328                    image_count.yellow(),
329                    suffix(image_count),
330                    recommendation.italic(),
331                )
332            } else {
333                "".to_string()
334            };
335            format!(
336                "  {}Downloaded {} {} file{}{}",
337                if total > 0 { Label::CHECKMARK } else { Label::CAUTION },
338                if total > 0 {
339                    total.green().to_string()
340                } else {
341                    total.yellow().to_string()
342                },
343                name.to_uppercase(),
344                suffix(total),
345                message,
346            )
347        }
348        fn has_image_extension(path: &String) -> bool {
349            path.to_lowercase().ends_with(".png") || path.to_lowercase().ends_with(".jpg")
350        }
351        let paths = self
352            .clone()
353            .file_paths("")
354            .into_iter()
355            .filter(|path| !IGNORE.iter().any(|x| path.ends_with(x)))
356            .collect::<Vec<String>>();
357        let total_data: usize = count_json_files(paths.clone());
358        let total_images: usize = count_image_files(paths.clone());
359        let message = download_complete_message(self.name, total_data, total_images);
360        let progress = ProgressBar::new(paths.len() as u64);
361        let client = Client::new();
362        paths.par_iter().for_each(|path| {
363            progress.set_style(ProgressStyle::with_template(Label::PROGRESS_BAR_TEMPLATE).unwrap());
364            progress.set_message(format!("Downloading {path}"));
365            let folder = format!("{}/{}", output.display(), parent(path.clone()).display());
366            std::fs::create_dir_all(folder.clone()).unwrap();
367            if let Ok(mut file) = File::create(format!("{}/{}", output.display(), path)) {
368                if let Some(url) = self.code_repository.raw_url(path.to_string()) {
369                    match client.get(url).header(USER_AGENT, "rust-web-api-client").send() {
370                        | Ok(response) => match response.bytes() {
371                            | Ok(bytes) => {
372                                let mut content = Cursor::new(bytes);
373                                let _ = copy(&mut content, &mut file);
374                            }
375                            | Err(why) => {
376                                error!(path, "=> {} Convert to bytes - {why}", Label::fail());
377                            }
378                        },
379                        | Err(why) => {
380                            error!(path, "=> {} Download file - {why}", Label::fail());
381                        }
382                    }
383                }
384            };
385            progress.inc(1);
386        });
387        progress.set_style(ProgressStyle::with_template("{msg}").unwrap());
388        progress.finish_with_message(message);
389        total_data + total_images
390    }
391    fn file_paths(self: Bucket, directory: &str) -> Vec<String> {
392        const FIRST_PAGE: Option<u32> = Some(1);
393        fn page_count(response: &reqwest::blocking::Response) -> u32 {
394            fn parse_header(headers: &HeaderMap, key: &str) -> u32 {
395                match headers.get(key) {
396                    | Some(val) if !val.is_empty() => {
397                        let value = val.to_str().unwrap().parse::<u32>().unwrap();
398                        debug!("=> {} {} = {}", Label::using(), key, value);
399                        value
400                    }
401                    | Some(_) | None => 0,
402                }
403            }
404            let headers = response.headers();
405            parse_header(headers, "x-total-pages")
406        }
407        match self.code_repository {
408            | Repository::GitHub { .. } => match self.tree(directory, None) {
409                | Ok(response) if response.status().is_success() => Bucket::parse_github_response(response),
410                | Ok(_) | Err(_) => {
411                    let url = self.tree_url(directory, None);
412                    debug!(url, "=> {}", Label::using());
413                    error!("=> {} Get file paths for {} bucket", Label::fail(), self.name.to_uppercase().red());
414                    vec![]
415                }
416            },
417            | Repository::GitLab { .. } => match self.tree(directory, FIRST_PAGE) {
418                | Ok(response) if response.status().is_success() => {
419                    let paths = (FIRST_PAGE.unwrap_or_default()..=page_count(&response))
420                        .into_par_iter()
421                        .map(|page| self.clone().file_paths_for_page(directory, Some(page)))
422                        .reduce(std::vec::Vec::new, |a, b| [a, b].concat());
423                    trace!("{:#?}", response);
424                    paths
425                }
426                | Ok(_) | Err(_) => {
427                    let url = self.tree_url(directory, FIRST_PAGE);
428                    debug!(url, "=> {}", Label::using());
429                    error!("=> {} Get file paths for {} bucket", Label::fail(), self.name.to_uppercase().red());
430                    vec![]
431                }
432            },
433        }
434    }
435    fn file_paths_for_page(self: Bucket, directory: &str, page: Option<u32>) -> Vec<String> {
436        match self.tree(directory, page) {
437            | Ok(response) if response.status().is_success() => match self.tree(directory, page) {
438                | Ok(response) if response.status().is_success() => Bucket::parse_gitlab_response(response),
439                | Ok(_) | Err(_) => {
440                    let url = self.tree_url(directory, Some(1));
441                    error!(url, page, "=> {} Failed to get paths", Label::fail());
442                    vec![]
443                }
444            },
445            | Ok(_) | Err(_) => {
446                let url = self.tree_url(directory, page);
447                error!(url, page, "=> {} Failed to get paths", Label::fail());
448                vec![]
449            }
450        }
451    }
452}
453impl BucketsConfig {
454    /// Read and parse buckets configuration file (JSON or YAML)
455    pub fn read(path: PathBuf) -> Option<BucketsConfig> {
456        let content = match MimeType::from_path(path.clone()) {
457            | MimeType::Json => match BucketsConfig::read_json(path.clone()) {
458                | Ok(value) => Some(value),
459                | Err(_) => None,
460            },
461            | MimeType::Yaml => match BucketsConfig::read_yaml(path.clone()) {
462                | Ok(value) => Some(value),
463                | Err(_) => None,
464            },
465            | _ => unimplemented!("Unsupported configuration file extension"),
466        };
467        if let Some(content) = content {
468            Some(content)
469        } else {
470            error!(path = path.to_str().unwrap(), "=> {} Import configuration", Label::fail());
471            std::process::exit(exitcode::UNAVAILABLE);
472        }
473    }
474    /// Read buckets configuration (e.g., `buckets.json`) using Serde and [`BucketsConfig`] struct
475    fn read_json(path: PathBuf) -> Result<BucketsConfig> {
476        let content = match read_file(path.clone()) {
477            | Ok(value) if !value.is_empty() => value,
478            | Ok(_) | Err(_) => {
479                error!(
480                    path = path.to_str().unwrap(),
481                    "=> {} Bucket configuration content is not valid",
482                    Label::fail()
483                );
484                "{}".to_owned()
485            }
486        };
487        let data: Result<BucketsConfig> = serde_json::from_str(&content);
488        let label = match data {
489            | Ok(_) => Label::using(),
490            | Err(_) => Label::invalid(),
491        };
492        trace!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
493        data
494    }
495    /// Read buckets configuration (e.g., `buckets.yaml`) using Serde and [`BucketsConfig`] struct
496    fn read_yaml(path: PathBuf) -> serde_yml::Result<BucketsConfig> {
497        let content = match read_file(path.clone()) {
498            | Ok(value) => value,
499            | Err(_) => {
500                error!(
501                    path = path.to_str().unwrap(),
502                    "=> {} Bucket configuration content is not valid",
503                    Label::fail()
504                );
505                "".to_owned()
506            }
507        };
508        let data: serde_yml::Result<BucketsConfig> = serde_yml::from_str(&content);
509        let label = match data {
510            | Ok(_) => Label::output(),
511            | Err(_) => Label::fail(),
512        };
513        debug!("=> {} Bucket configuration = {:#?}", label, data.dimmed());
514        data
515    }
516}
517impl GithubTreeEntry {
518    fn path(self) -> String {
519        self.path
520    }
521    fn is_blob(&self) -> bool {
522        self.entry_type.eq(&EntryType::Blob)
523    }
524}
525impl GitlabTreeEntry {
526    fn path(self) -> String {
527        self.path
528    }
529    fn is_blob(&self) -> bool {
530        self.entry_type.eq(&EntryType::Blob)
531    }
532}
533impl Repository {
534    /// Get metadata for latest release of a Gitlab or GitHub repository
535    pub fn latest_release(self) -> Option<Release> {
536        match self.releases() {
537            | releases if releases.is_empty() => None,
538            | releases => {
539                let release = releases[0].clone();
540                trace!("=> {} Latest {:#?}", Label::using(), release);
541                Some(release)
542            }
543        }
544    }
545    fn id(&self) -> Option<String> {
546        match self {
547            | Repository::GitHub { .. } => None,
548            | Repository::GitLab { id, uri } => match URI::try_from(uri.as_str()) {
549                | Ok(value) => {
550                    let mut path = value.path().to_string();
551                    path.remove(0);
552                    let encoded = encode(&path).to_string();
553                    trace!(encoded, "=> {} ID", Label::using());
554                    Some(encoded)
555                }
556                | Err(why) => {
557                    warn!(uri, "=> {} Parse GitLab URI - {why}", Label::fail());
558                    match id {
559                        | Some(value) => Some(value.to_string()),
560                        | None => None,
561                    }
562                }
563            },
564        }
565    }
566    fn releases(self) -> Vec<Release> {
567        let maybe_url = match &self {
568            | Repository::GitHub { uri } => match URI::try_from(uri.as_str()) {
569                | Ok(uri) => {
570                    let host = uri.host().unwrap().to_string();
571                    let path = uri.path();
572                    let endpoint = Some(format!("https://api.{host}/repos{path}/releases"));
573                    println!("{endpoint:#?}");
574                    endpoint
575                }
576                | Err(_) => {
577                    error!(uri, "=> {} Parse GitHub URI", Label::fail());
578                    None
579                }
580            },
581            | Repository::GitLab { uri, .. } => match self.id() {
582                | Some(id) => match URI::try_from(uri.as_str()) {
583                    | Ok(uri) => {
584                        let host = uri.host().unwrap().to_string();
585                        Some(format!("https://{host}/api/v4/projects/{id}/releases"))
586                    }
587                    | Err(why) => {
588                        error!(uri, "=> {} Parse GitLab URI - {why}", Label::fail());
589                        None
590                    }
591                },
592                | None => None,
593            },
594        };
595        if let Some(url) = maybe_url {
596            debug!(url, "=> {}", Label::using());
597            let client = Client::new();
598            match client.get(url).header(USER_AGENT, "rust-web-api-client").send() {
599                | Ok(response) => match response.text() {
600                    | Ok(text) => {
601                        let releases: Vec<Release> = match serde_json::from_str(&text) {
602                            | Ok(values) => values,
603                            | Err(why) => {
604                                error!("=> {} Parse {} API JSON response - {why}", self, Label::fail());
605                                vec![]
606                            }
607                        };
608                        releases
609                    }
610                    | Err(why) => {
611                        error!("=> {} Parse {} API text response - {why}", self, Label::fail());
612                        vec![]
613                    }
614                },
615                | Err(why) => {
616                    error!("=> {} Download {} releases - {why}", self, Label::fail());
617                    vec![]
618                }
619            }
620        } else {
621            vec![]
622        }
623    }
624    /// Get URL for raw data of a file at a given path
625    fn raw_url(&self, path: String) -> Option<String> {
626        match self {
627            | Repository::GitHub { uri, .. } => match URI::try_from(uri.clone().as_str()) {
628                | Ok(ref value) => Some(format!("https://raw.githubusercontent.com{}/refs/heads/main/{path}", value.path())),
629                | Err(why) => {
630                    error!(uri, "=> {} Parse GitHub URI - {why}", Label::fail());
631                    None
632                }
633            },
634            | Repository::GitLab { uri, .. } => Some(format!("{uri}/-/raw/main/{path}")),
635        }
636    }
637}
638
639#[cfg(test)]
640mod tests;