datahugger 0.5.6

Tool for fetching data and metadata from DOI or URL.
Documentation
#![allow(clippy::upper_case_acronyms)]

use async_trait::async_trait;
use exn::{Exn, ResultExt};
use serde_json::Value as JsonValue;
use url::Url;

use reqwest::{Client, StatusCode};
use std::{any::Any, str::FromStr};

use crate::helper::json_extract;
use crate::{
    repo::{Endpoint, FileMeta, RepoError},
    Checksum, DatasetBackend, DirMeta, Entry,
};

// https://osf.io/
// API root url at https://api.osf.io/v2/nodes/
#[derive(Debug)]
pub struct OSF {
    pub id: String,
}

impl OSF {
    #[must_use]
    pub fn new(id: impl Into<String>) -> Self {
        OSF { id: id.into() }
    }
}

#[async_trait]
impl DatasetBackend for OSF {
    fn root_url(&self) -> Url {
        // https://api.osf.io/v2/nodes/<id>/files to start for every dateset entry

        // Safe to unwrap:
        // - the base URL is a hard-coded, valid absolute URL
        // - `path_segments_mut` cannot fail for this URL scheme
        let mut url = Url::from_str("https://api.osf.io/v2/nodes/").unwrap();
        url.path_segments_mut().unwrap().extend([&self.id, "files"]);
        url
    }

    async fn list(&self, client: &Client, dir: DirMeta) -> Result<Vec<Entry>, Exn<RepoError>> {
        let resp = client
            .get(dir.api_url())
            .send()
            .await
            .or_raise(|| RepoError {
                message: format!("fail at client sent GET {}", dir.api_url()),
            })?;
        let resp = resp.error_for_status().map_err(|err| match err.status() {
            Some(StatusCode::NOT_FOUND) => RepoError {
                message: format!("resource not found when GET {}", dir.api_url()),
            },
            Some(status_code) => RepoError {
                message: format!(
                    "fail GET {}, with state code: {}",
                    dir.api_url(),
                    status_code.as_str()
                ),
            },
            None => RepoError {
                message: format!("fail GET {}, network / protocol error", dir.api_url(),),
            },
        })?;
        let resp: JsonValue = resp.json().await.or_raise(|| RepoError {
            message: format!("fail GET {}, unable to convert to json", dir.api_url(),),
        })?;
        let files = resp
            .get("data")
            .and_then(JsonValue::as_array)
            .ok_or_else(|| RepoError {
                message: "field with key 'data' not resolve to an json array".to_string(),
            })?;

        let mut entries = Vec::with_capacity(files.len());
        for (idx, filej) in files.iter().enumerate() {
            let endpoint = Endpoint {
                parent_url: dir.api_url(),
                key: Some(format!("data.{idx}")),
            };
            let name: String = json_extract(filej, "attributes.name").or_raise(|| RepoError {
                message: "fail to extracting 'attributes.name' as String from json".to_string(),
            })?;
            let kind: String = json_extract(filej, "attributes.kind").or_raise(|| RepoError {
                message: "fail to extracting 'attributes.kind' as String from json".to_string(),
            })?;
            let guess = mime_guess::from_path(&name);
            match kind.as_ref() {
                "file" => {
                    let size: u64 =
                        json_extract(filej, "attributes.size").or_raise(|| RepoError {
                            message: "fail to extracting 'attributes.size' as u64 from json"
                                .to_string(),
                        })?;
                    let download_url: String =
                        json_extract(filej, "links.download").or_raise(|| RepoError {
                            message: "fail to extracting 'links.download' as String from json"
                                .to_string(),
                        })?;
                    let download_url = Url::from_str(&download_url).or_raise(|| RepoError {
                        message: format!("cannot parse '{download_url}' download url"),
                    })?;
                    let hash: String = json_extract(filej, "attributes.extra.hashes.sha256")
                        .or_raise(|| RepoError {
                            message: "fail to extracting 'attributes.extra.hashes.sha256' as String from json"
                                .to_string(),
                        })?;
                    let checksum = Checksum::Sha256(hash);
                    let file = FileMeta::new(
                        None,
                        None,
                        dir.join(&name),
                        endpoint,
                        download_url,
                        Some(size),
                        vec![checksum],
                        guess.first(),
                        None,
                        None,
                        None,
                        true,
                    );
                    entries.push(Entry::File(file));
                }
                "folder" => {
                    let api_url: String =
                        json_extract(filej, "relationships.files.links.related.href")
                        .or_raise(|| RepoError {
                            message: "fail to extracting 'relationships.files.links.related.href' as String from json"
                                .to_string(),
                        })?;
                    let api_url = Url::from_str(&api_url).or_raise(|| RepoError {
                        message: format!("cannot parse '{api_url}' api url"),
                    })?;
                    let dir = DirMeta::new(dir.join(&name), api_url, dir.root_url());
                    entries.push(Entry::Dir(dir));
                }
                typ => {
                    exn::bail!(RepoError {
                        message: format!(
                            "kind can be 'dataset' or 'kind' for an OSF entry, got {typ}"
                        )
                    });
                }
            }
        }

        Ok(entries)
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}