use async_trait::async_trait;
use exn::Exn;
use mime::Mime;
use reqwest::Client;
use url::Url;
use std::{any::Any, path::Path, sync::Arc};
use digest::Digest;
/// Marker segment that every absolute [`CrawlPath`] starts with.
const ROOT: &str = "__ROOT__";

/// Slash-separated location of an entry, stored as a plain string.
///
/// A path is *absolute* when its first segment is the `__ROOT__` marker
/// (see [`CrawlPath::root`]); every other path is relative.
#[derive(Debug, Clone)]
pub struct CrawlPath(String);

impl std::fmt::Display for CrawlPath {
    /// Writes the stored path verbatim, root marker included.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl AsRef<Path> for CrawlPath {
    /// Views the stored string as a [`Path`] without copying.
    ///
    /// The root marker is *not* removed; call [`CrawlPath::relative`] first
    /// when the marker must not show up in the resulting path.
    fn as_ref(&self) -> &Path {
        Path::new(&self.0)
    }
}

impl CrawlPath {
    /// Returns a new path with `p` appended as a further segment.
    ///
    /// Exactly one `/` is inserted between the current path and `p` unless the
    /// current path already ends with `/`. Joining onto an empty path (such as
    /// the relative form of the root) returns `p` on its own, so the result
    /// never gains a leading `/` that would turn it into an absolute
    /// filesystem path when viewed through `AsRef<Path>`.
    ///
    /// `p` itself is appended verbatim; it is not normalised.
    #[must_use]
    pub fn join(&self, p: &str) -> CrawlPath {
        if self.0.is_empty() {
            return CrawlPath(p.to_string());
        }
        let mut joined = String::with_capacity(self.0.len() + 1 + p.len());
        joined.push_str(&self.0);
        if !joined.ends_with('/') {
            joined.push('/');
        }
        joined.push_str(p);
        CrawlPath(joined)
    }

    /// Borrows the stored path string.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Returns the root path, which consists of the root marker only.
    #[must_use]
    pub fn root() -> CrawlPath {
        CrawlPath(ROOT.to_string())
    }

    /// Returns `true` when the first segment of the path is the root marker.
    ///
    /// The marker has to be a complete segment: `__ROOT__` and `__ROOT__/a`
    /// are absolute, whereas a name that merely begins with the marker text
    /// (for example `__ROOT__extra`) is relative.
    #[must_use]
    pub fn is_absolute(&self) -> bool {
        self.strip_root().is_some()
    }

    /// Returns the path with the leading root marker (and the `/` following
    /// it) removed.
    ///
    /// Paths that are not absolute are returned unchanged; the root itself
    /// becomes the empty path.
    #[must_use]
    pub fn relative(&self) -> CrawlPath {
        self.strip_root()
            .map_or_else(|| self.clone(), |rest| CrawlPath(rest.to_string()))
    }

    /// Returns what follows the root marker and its separator, or `None` when
    /// the path does not start with the marker as a whole segment.
    fn strip_root(&self) -> Option<&str> {
        let rest = self.0.strip_prefix(ROOT)?;
        if rest.is_empty() {
            Some(rest)
        } else {
            // Requiring the separator keeps names such as `__ROOT__extra`
            // from being mistaken for absolute paths.
            rest.strip_prefix('/')
        }
    }
}
/// Incremental hasher that wraps one of the supported digest implementations.
pub enum Hasher {
    /// Wraps an `md5::Md5` digest state.
    Md5(md5::Md5),
    /// Wraps a `sha2::Sha256` digest state.
    Sha256(sha2::Sha256),
    /// Wraps a `sha1::Sha1` digest state.
    Sha1(sha1::Sha1),
}

impl Hasher {
    /// Feeds `data` into whichever digest state this hasher wraps.
    pub fn update(&mut self, data: &[u8]) {
        match self {
            Self::Md5(state) => state.update(data),
            Self::Sha256(state) => state.update(data),
            Self::Sha1(state) => state.update(data),
        }
    }

    /// Consumes the hasher and returns the finished digest as raw bytes.
    #[must_use]
    pub fn finalize(self) -> Vec<u8> {
        match self {
            Self::Md5(state) => state.finalize().to_vec(),
            Self::Sha256(state) => state.finalize().to_vec(),
            Self::Sha1(state) => state.finalize().to_vec(),
        }
    }
}
/// One item of a listing: either a directory or a file.
///
/// `DatasetBackend::list` returns a `Vec` of these.
// NOTE(review): the lint is silenced instead of boxing `FileMeta`, presumably so
// callers can keep matching on `Entry::File(meta)` directly — confirm this is
// still the intended trade-off.
#[allow(clippy::large_enum_variant)]
#[derive(Debug)]
pub enum Entry {
/// A directory, described by its [`DirMeta`].
Dir(DirMeta),
/// A file, described by its [`FileMeta`].
File(FileMeta),
}
#[derive(Debug, Clone)]
pub struct DirMeta {
path: CrawlPath,
root_url: Url,
api_url: Url,
}
impl std::fmt::Display for DirMeta {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"DirMeta (at: {}, src: {}, src_root: {})",
self.path,
self.api_url.as_str(),
self.root_url.as_str(),
)
}
}
impl DirMeta {
#[must_use]
pub fn new(path: CrawlPath, api_url: Url, root_url: Url) -> Self {
DirMeta {
path,
root_url,
api_url,
}
}
#[must_use]
pub fn new_root(api_url: &Url) -> Self {
DirMeta {
path: CrawlPath(ROOT.to_string()),
api_url: api_url.clone(),
root_url: api_url.clone(),
}
}
#[must_use]
pub fn path(&self) -> CrawlPath {
self.path.clone()
}
#[must_use]
pub fn root_url(&self) -> Url {
self.root_url.clone()
}
#[must_use]
pub fn api_url(&self) -> Url {
self.api_url.clone()
}
#[must_use]
pub fn relative(&self) -> CrawlPath {
self.path.relative()
}
#[must_use]
pub fn join(&self, p: &str) -> CrawlPath {
self.path.join(p)
}
}
/// A parent URL paired with an optional key.
#[derive(Debug, Clone)]
pub struct Endpoint {
    /// URL of the parent this endpoint belongs to.
    pub parent_url: Url,
    /// Optional key; `Display` renders a missing key as `<Null>`.
    pub key: Option<String>,
}

impl std::fmt::Display for Endpoint {
    /// Renders the parent URL and the key (or `<Null>`) on a single line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Borrow the key rather than cloning it, and keep the fallback a
        // `&'static str` so formatting does not allocate.
        let key = self.key.as_deref().unwrap_or("<Null>");
        write!(
            f,
            "Endpoint (parent_url: {}, key: {})",
            self.parent_url.as_str(),
            key
        )
    }
}
/// Metadata describing a single file.
///
/// All fields are private; they are set through `FileMeta::new` and read
/// through the accessor methods.
#[derive(Debug)]
pub struct FileMeta {
/// File name, if one is known.
filename: Option<String>,
/// Identifier of the file, if any.
// NOTE(review): presumably assigned by the source repository — confirm.
file_identifier: Option<String>,
/// Location of the file as a `CrawlPath`.
path: CrawlPath,
/// Endpoint (parent URL plus optional key) associated with the file.
endpoint: Endpoint,
/// URL returned by `download_url()`.
download_url: Url,
/// Size, if known; `Display` prints it with a `bytes` suffix.
size: Option<u64>,
/// Zero or more checksums recorded for the file.
checksum: Vec<Checksum>,
/// MIME type, if known.
mimetype: Option<Mime>,
/// Version, kept as an unparsed string.
version: Option<String>,
/// Creation date, kept as an unparsed string (no format is enforced here).
creation_date: Option<String>,
/// Last modification date, kept as an unparsed string (no format is enforced here).
last_modification_date: Option<String>,
/// Flag reported by `is_downloadable()`.
downloadable: bool,
}
impl FileMeta {
    /// Returns the file name, if one was recorded.
    #[must_use]
    pub fn filename(&self) -> Option<&str> {
        self.filename.as_deref()
    }

    /// Returns the file identifier, if one was recorded.
    #[must_use]
    pub fn file_identifier(&self) -> Option<&str> {
        self.file_identifier.as_deref()
    }

    /// Returns the stored `downloadable` flag.
    #[must_use]
    pub fn is_downloadable(&self) -> bool {
        self.downloadable
    }

    /// Returns a copy of the file's path.
    #[must_use]
    pub fn path(&self) -> CrawlPath {
        self.path.clone()
    }

    /// Returns a copy of the download URL.
    #[must_use]
    pub fn download_url(&self) -> Url {
        self.download_url.clone()
    }

    /// Returns the recorded checksums; the slice is empty when there are none.
    #[must_use]
    pub fn checksum(&self) -> &[Checksum] {
        &self.checksum
    }

    /// Returns the file size, if known.
    #[must_use]
    pub fn size(&self) -> Option<u64> {
        self.size
    }

    /// Returns the version string, if one was recorded.
    #[must_use]
    pub fn version(&self) -> Option<&str> {
        self.version.as_deref()
    }

    /// Returns a copy of the MIME type, if known.
    #[must_use]
    pub fn mimetype(&self) -> Option<Mime> {
        self.mimetype.clone()
    }

    /// Returns the creation date string, if one was recorded.
    #[must_use]
    pub fn creation_date(&self) -> Option<&str> {
        self.creation_date.as_deref()
    }

    /// Returns the last modification date string, if one was recorded.
    #[must_use]
    pub fn last_modification_date(&self) -> Option<&str> {
        self.last_modification_date.as_deref()
    }
}
impl std::fmt::Display for FileMeta {
    /// Renders a multi-line summary: path, endpoint, download URL, size,
    /// MIME type and checksums, each on its own line.
    ///
    /// Missing size or MIME type is shown as `<unknown>`; an empty checksum
    /// list is shown as `<none>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The fallbacks are built lazily so no placeholder string is
        // allocated when the real value is present.
        let size_str = self
            .size
            .map_or_else(|| "<unknown>".to_string(), |s| format!("{s} bytes"));
        let checksum_str = if self.checksum.is_empty() {
            "<none>".to_string()
        } else {
            self.checksum
                .iter()
                .map(std::string::ToString::to_string)
                .collect::<Vec<_>>()
                .join(", ")
        };
        let mimetype_str = self.mimetype.as_ref().map_or_else(
            || "<unknown>".to_string(),
            std::string::ToString::to_string,
        );
        writeln!(f, "📄 FileMeta:")?;
        writeln!(f, " Path : {}", self.path)?;
        writeln!(f, " Endpoint : {}", self.endpoint)?;
        writeln!(f, " Download : {}", self.download_url)?;
        writeln!(f, " Size : {size_str}")?;
        writeln!(f, " Mime Type : {mimetype_str}")?;
        writeln!(f, " Checksums : {checksum_str}")?;
        Ok(())
    }
}
impl FileMeta {
#[allow(clippy::too_many_arguments)]
#[must_use]
pub fn new(
filename: Option<String>,
file_identifier: Option<String>,
path: CrawlPath,
endpoint: Endpoint,
download_url: Url,
size: Option<u64>,
checksum: Vec<Checksum>,
mimetype: Option<Mime>,
version: Option<String>,
creation_date: Option<String>,
last_modification_date: Option<String>,
downloadable: bool,
) -> Self {
FileMeta {
filename,
file_identifier,
path,
endpoint,
download_url,
size,
checksum,
mimetype,
version,
creation_date,
last_modification_date,
downloadable,
}
}
#[must_use]
pub fn relative(&self) -> CrawlPath {
self.path.relative()
}
#[must_use]
pub fn endpoint(&self) -> Endpoint {
self.endpoint.clone()
}
}
/// A checksum value tagged with the algorithm that produced it.
///
/// The payload is stored as a string exactly as supplied by the caller.
#[derive(Debug, Clone)]
pub enum Checksum {
    /// MD5 checksum.
    Md5(String),
    /// SHA-256 checksum.
    Sha256(String),
    /// SHA-1 checksum.
    Sha1(String),
}

impl std::fmt::Display for Checksum {
    /// Renders the checksum as `(<algorithm>: <value>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Md5(h) => {
                write!(f, "(md5: {h})")
            }
            Self::Sha256(h) => {
                write!(f, "(sha256: {h})")
            }
            Self::Sha1(h) => {
                write!(f, "(sha1: {h})")
            }
        }
    }
}
/// Error carrying a human-readable message about a failed repository
/// operation.
#[derive(Debug)]
pub struct RepoError {
    /// Description of what went wrong.
    pub message: String,
}

// No `source()` override: the error does not wrap an underlying cause.
impl std::error::Error for RepoError {}

impl std::fmt::Display for RepoError {
    /// Renders the message prefixed with `repo fail: `.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message: _ } = self;
        write!(f, "repo fail: {}", self.message)
    }
}
/// Interface a dataset source implements so that its contents can be listed.
///
/// Implementors must be `Send + Sync` and `Any`; `Dataset` stores them behind
/// an `Arc<dyn DatasetBackend>`.
#[async_trait]
pub trait DatasetBackend: Send + Sync + Any {
/// Lists the entries of `dir` using `client`.
///
/// # Errors
///
/// Returns a `RepoError` wrapped in `Exn` when the listing fails.
// NOTE(review): presumably `client` is used for HTTP requests against
// `dir.api_url()` — confirm against the implementations.
async fn list(&self, client: &Client, dir: DirMeta) -> Result<Vec<Entry>, Exn<RepoError>>;
/// Returns the URL from which `Dataset::root_dir` builds the root `DirMeta`.
fn root_url(&self) -> Url;
/// Returns `self` as `&dyn Any`.
// NOTE(review): presumably so callers can downcast to the concrete backend — confirm.
fn as_any(&self) -> &dyn Any;
}
/// Handle to a dataset, backed by a shared [`DatasetBackend`].
///
/// Cloning the handle clones the `Arc`, so all clones share one backend.
#[derive(Clone)]
pub struct Dataset {
    /// The backend, shared behind an `Arc`.
    pub backend: Arc<dyn DatasetBackend>,
}

impl Dataset {
    /// Wraps `backend` in an `Arc` and returns a dataset handle for it.
    #[must_use]
    pub fn new(backend: impl DatasetBackend) -> Self {
        let shared: Arc<dyn DatasetBackend> = Arc::new(backend);
        Self { backend: shared }
    }

    /// Returns the root directory record, built from the backend's root URL.
    #[must_use]
    pub fn root_dir(&self) -> DirMeta {
        let root_url = self.backend.root_url();
        DirMeta::new_root(&root_url)
    }
}