datahugger 0.5.5

A tool for fetching data and metadata from a DOI or URL.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
use async_trait::async_trait;
use exn::Exn;
use mime::Mime;
use reqwest::Client;
use url::Url;

use std::{any::Any, path::Path, sync::Arc};

use digest::Digest;

const ROOT: &str = "__ROOT__";

/// A logical crawl path used to track the current location during repository crawling.
///
/// `CrawlPath` is a lightweight, owned wrapper around `String` that represents
/// URL-like, slash-separated paths. It intentionally does **not** use filesystem
/// semantics (`PathBuf`), as crawl paths follow logical repository structure rather
/// than OS-specific path rules.
///
/// Paths may be *absolute* (prefixed with a special root marker) or *relative*.
/// The root marker is an internal invariant and is stripped when converting to a
/// relative path.
///
/// This type is always owned to make it safe and ergonomic to use across asynchronous
/// tasks and threads.
///
/// # Invariants
///
/// ```
/// const ROOT: &str = "__ROOT__";
/// ```
///
/// - Absolute paths always start with `ROOT`.
/// - Relative paths never start with `ROOT`.
/// - Path separators are forward slashes (`'/'`).
///
/// # Examples
///
/// ```
/// use datahugger::CrawlPath;
///
/// let root = CrawlPath::root();
/// let p = root.join("dir").join("file.txt");
///
/// assert!(p.is_absolute());
/// assert_eq!(p.relative().as_ref(), std::path::Path::new("dir/file.txt"));
/// ```
#[derive(Debug, Clone)]
pub struct CrawlPath(String);

impl std::fmt::Display for CrawlPath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl AsRef<Path> for CrawlPath {
    fn as_ref(&self) -> &Path {
        Path::new(&self.0)
    }
}

impl CrawlPath {
    /// Appends a path segment to this crawl path, returning a new `CrawlPath`.
    ///
    /// The segment is joined using a forward slash (`'/'`). Joining onto an
    /// empty relative path (the relative form of the root) returns the segment
    /// itself, so the result never gains a spurious leading slash. This method
    /// does not perform normalization and assumes `p` does not contain leading
    /// slashes.
    #[must_use]
    pub fn join(&self, p: &str) -> CrawlPath {
        // An empty base must not grow a leading '/': "".join("a") is "a",
        // not "/a", otherwise the relative-path invariant would be violated.
        if self.0.is_empty() {
            return CrawlPath(p.to_string());
        }
        let mut new_path = String::with_capacity(self.0.len() + 1 + p.len());
        new_path.push_str(&self.0);
        if !new_path.ends_with('/') {
            new_path.push('/');
        }
        new_path.push_str(p);
        CrawlPath(new_path)
    }

    /// Returns the underlying path as a string slice.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Returns the root crawl path.
    ///
    /// The root path is represented internally using a special marker and is
    /// considered absolute.
    #[must_use]
    pub fn root() -> CrawlPath {
        CrawlPath(ROOT.to_string())
    }

    /// Returns `true` if this path is absolute (i.e. starts from the crawl root).
    #[must_use]
    pub fn is_absolute(&self) -> bool {
        self.0.starts_with(ROOT)
    }

    /// Converts this path into a relative crawl path.
    ///
    /// If the path is absolute, the root marker (and an optional following slash)
    /// is stripped. If the path is already relative, it is returned unchanged.
    ///
    /// An absolute root path (`ROOT` or `ROOT/`) is converted into an empty
    /// relative path.
    ///
    /// # Panics
    ///
    /// Panics if this path is marked as absolute but does not start with `ROOT`.
    /// It indicates a violation of the internal `CrawlPath` invariants.
    #[must_use]
    pub fn relative(&self) -> CrawlPath {
        if !self.is_absolute() {
            return self.clone();
        }

        let rest = self
            .0
            .strip_prefix(ROOT)
            .expect("absolute paths start with ROOT");

        let rest = rest.strip_prefix('/').unwrap_or(rest);

        CrawlPath(rest.to_string())
    }
}

/// Streaming digest state for one of the supported checksum algorithms.
///
/// Wraps the concrete hasher so callers can feed bytes without knowing
/// which algorithm variant was selected (matches the [`Checksum`] variants).
pub enum Hasher {
    Md5(md5::Md5),
    Sha256(sha2::Sha256),
    Sha1(sha1::Sha1),
}

impl Hasher {
    /// Feeds `data` into the underlying digest state.
    ///
    /// May be called repeatedly to hash streamed content chunk by chunk.
    pub fn update(&mut self, data: &[u8]) {
        match self {
            Hasher::Md5(h) => h.update(data),
            Hasher::Sha256(h) => h.update(data),
            Hasher::Sha1(h) => h.update(data),
        }
    }

    /// Consumes the hasher and returns the finalized digest as raw bytes.
    #[must_use]
    pub fn finalize(self) -> Vec<u8> {
        match self {
            Hasher::Md5(h) => h.finalize().to_vec(),
            Hasher::Sha256(h) => h.finalize().to_vec(),
            Hasher::Sha1(h) => h.finalize().to_vec(),
        }
    }
}

/// A single entry discovered while listing a repository directory.
// NOTE: FileMeta is much larger than DirMeta, hence the lint allow below.
#[allow(clippy::large_enum_variant)]
#[derive(Debug)]
pub enum Entry {
    /// A directory that can be listed further.
    Dir(DirMeta),
    /// A file together with its metadata.
    File(FileMeta),
}

/// Metadata for a directory encountered during a crawl.
#[derive(Debug, Clone)]
pub struct DirMeta {
    /// Logical crawl location of this directory.
    path: CrawlPath,
    /// API URL of the crawl root this directory belongs to.
    root_url: Url,
    /// API URL used to list this directory's contents.
    api_url: Url,
}

impl std::fmt::Display for DirMeta {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "DirMeta (at: {}, src: {}, src_root: {})",
            self.path,
            self.api_url.as_str(),
            self.root_url.as_str(),
        )
    }
}

impl DirMeta {
    #[must_use]
    pub fn new(path: CrawlPath, api_url: Url, root_url: Url) -> Self {
        DirMeta {
            path,
            root_url,
            api_url,
        }
    }

    #[must_use]
    pub fn new_root(api_url: &Url) -> Self {
        DirMeta {
            path: CrawlPath(ROOT.to_string()),
            api_url: api_url.clone(),
            root_url: api_url.clone(),
        }
    }

    #[must_use]
    pub fn path(&self) -> CrawlPath {
        self.path.clone()
    }

    #[must_use]
    pub fn root_url(&self) -> Url {
        self.root_url.clone()
    }

    #[must_use]
    pub fn api_url(&self) -> Url {
        self.api_url.clone()
    }

    #[must_use]
    pub fn relative(&self) -> CrawlPath {
        self.path.relative()
    }

    #[must_use]
    pub fn join(&self, p: &str) -> CrawlPath {
        self.path.join(p)
    }
}

// TODO: this should support both xml and json to re-locate where the entry is defined
/// Points back to where an entry was defined in an API response: the parent
/// listing URL plus an optional key within that response.
#[derive(Debug, Clone)]
pub struct Endpoint {
    /// URL of the API response that contained the entry.
    pub parent_url: Url,
    /// Optional key locating the entry within the parent response, if any.
    pub key: Option<String>,
}

impl std::fmt::Display for Endpoint {
    /// Renders as `Endpoint (parent_url: <url>, key: <key|<Null>>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "Endpoint (parent_url: {}, key: {})",
            self.parent_url.as_str(),
            // `as_deref` borrows the key instead of cloning it, and the literal
            // fallback avoids allocating a String on every call (clippy: or_fun_call).
            self.key.as_deref().unwrap_or("<Null>")
        )
    }
}

// TODO: `FileMetaByScan` will include the full accurate mimetype and size and checksum.

/// Metadata describing a crawled file.
///
/// The `mimetype` is taken directly from the API response and is not
/// validated against the file contents. As a result, it may be incorrect.
/// For example, some APIs infer MIME types from file extensions rather
/// than inspecting the actual data.
#[derive(Debug)]
pub struct FileMeta {
    /// Filename, if known.
    filename: Option<String>,
    /// Backend-specific identifier for the file, if any.
    file_identifier: Option<String>,
    /// Logical crawl path of the file.
    path: CrawlPath,
    /// Where this entry was defined (parent response URL + optional key).
    endpoint: Endpoint,
    /// URL from which the file contents can be fetched.
    download_url: Url,
    /// Size in bytes, if known.
    size: Option<u64>,
    /// Zero or more checksums reported for the file.
    checksum: Vec<Checksum>,
    /// MIME type as reported by the API (unvalidated; see type docs).
    mimetype: Option<Mime>,
    /// Version string, if known.
    version: Option<String>,
    /// Creation date string, if known.
    creation_date: Option<String>,
    /// Last-modification date string, if known.
    last_modification_date: Option<String>,
    /// Whether the file can be downloaded.
    downloadable: bool,
}

impl FileMeta {
    /// Returns the filename, if known.
    pub fn filename(&self) -> Option<&str> {
        self.filename.as_deref()
    }

    /// Returns the backend-specific file identifier, if any.
    pub fn file_identifier(&self) -> Option<&str> {
        self.file_identifier.as_deref()
    }

    /// Returns whether the file can be downloaded.
    pub fn is_downloadable(&self) -> bool {
        self.downloadable
    }

    /// Returns the crawl path of the file.
    pub fn path(&self) -> CrawlPath {
        self.path.clone()
    }

    /// Returns the download URL of the file.
    pub fn download_url(&self) -> Url {
        self.download_url.clone()
    }

    /// Returns the checksums associated with the file.
    pub fn checksum(&self) -> &[Checksum] {
        &self.checksum
    }

    /// Returns the file size in bytes if known.
    pub fn size(&self) -> Option<u64> {
        self.size
    }

    /// Returns the file version string, if known.
    pub fn version(&self) -> Option<&str> {
        self.version.as_deref()
    }

    /// Returns the MIME type if known (unvalidated; see the struct docs).
    pub fn mimetype(&self) -> Option<Mime> {
        self.mimetype.clone()
    }

    /// Returns the creation date string, if known.
    pub fn creation_date(&self) -> Option<&str> {
        self.creation_date.as_deref()
    }

    /// Returns the last-modification date string, if known.
    pub fn last_modification_date(&self) -> Option<&str> {
        self.last_modification_date.as_deref()
    }
}

impl std::fmt::Display for FileMeta {
    /// Renders a multi-line, human-readable summary of the file metadata.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let size_str = match self.size {
            Some(bytes) => format!("{bytes} bytes"),
            None => "<unknown>".to_string(),
        };

        let checksum_str = if self.checksum.is_empty() {
            "<none>".to_string()
        } else {
            self.checksum
                .iter()
                .map(ToString::to_string)
                .collect::<Vec<_>>()
                .join(", ")
        };

        let mimetype_str = match &self.mimetype {
            Some(mime) => mime.to_string(),
            None => "<unknown>".to_string(),
        };

        writeln!(f, "📄 FileMeta:")?;
        writeln!(f, "  Path       : {}", self.path)?;
        writeln!(f, "  Endpoint   : {}", self.endpoint)?;
        writeln!(f, "  Download   : {}", self.download_url)?;
        writeln!(f, "  Size       : {size_str}")?;
        writeln!(f, "  Mime Type  : {mimetype_str}")?;
        writeln!(f, "  Checksums  : {checksum_str}")?;

        Ok(())
    }
}

impl FileMeta {
    /// Constructs a `FileMeta` from all of its fields.
    ///
    /// Kept as a single flat constructor for now; see the field documentation
    /// on `FileMeta` for the meaning of each argument.
    #[allow(clippy::too_many_arguments)]
    #[must_use]
    pub fn new(
        filename: Option<String>,
        file_identifier: Option<String>,
        path: CrawlPath,
        endpoint: Endpoint,
        download_url: Url,
        size: Option<u64>,
        checksum: Vec<Checksum>,
        mimetype: Option<Mime>,
        version: Option<String>,
        creation_date: Option<String>,
        last_modification_date: Option<String>,
        downloadable: bool,
    ) -> Self {
        FileMeta {
            filename,
            file_identifier,
            path,
            endpoint,
            download_url,
            size,
            checksum,
            mimetype,
            version,
            creation_date,
            last_modification_date,
            downloadable,
        }
    }
    /// Returns the file's crawl path relative to the crawl root.
    #[must_use]
    pub fn relative(&self) -> CrawlPath {
        self.path.relative()
    }
    /// Returns a clone of the endpoint this entry was defined at.
    #[must_use]
    pub fn endpoint(&self) -> Endpoint {
        self.endpoint.clone()
    }
}

// XXX: GitHub blobs don't validate directly: they use sha1 but (probably) hash the
// content with a 'blob {len}' prefix. Leaning towards not validating GitHub
// downloads for simplicity; revisit only if users ask for it.
/// A file checksum, tagged with the digest algorithm that produced it.
#[derive(Debug, Clone)]
pub enum Checksum {
    Md5(String),
    Sha256(String),
    Sha1(String),
}

impl std::fmt::Display for Checksum {
    /// Renders as `(<algo>: <hex digest>)`, e.g. `(md5: abc…)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (algo, digest) = match self {
            Checksum::Md5(d) => ("md5", d),
            Checksum::Sha256(d) => ("sha256", d),
            Checksum::Sha1(d) => ("sha1", d),
        };
        write!(f, "({algo}: {digest})")
    }
}

/// Error raised when a repository backend operation fails.
#[derive(Debug)]
pub struct RepoError {
    pub message: String,
}

impl std::fmt::Display for RepoError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("repo fail: ")?;
        f.write_str(&self.message)
    }
}

impl std::error::Error for RepoError {}

/// Backend implementation for a specific repository API.
#[async_trait]
pub trait DatasetBackend: Send + Sync + Any {
    /// Lists the entries (files and subdirectories) of `dir` using `client`.
    async fn list(&self, client: &Client, dir: DirMeta) -> Result<Vec<Entry>, Exn<RepoError>>;
    /// Returns the root API URL of this dataset.
    fn root_url(&self) -> Url;
    /// Upcasts to `&dyn Any`, enabling downcasts to the concrete backend type.
    fn as_any(&self) -> &dyn Any;
}

/// A dataset handle backed by a shared, dynamically-dispatched backend.
#[derive(Clone)]
pub struct Dataset {
    /// The repository backend; `Arc` makes `Dataset` cheap to clone across tasks.
    pub backend: Arc<dyn DatasetBackend>,
}

impl Dataset {
    /// Wraps a concrete backend into a `Dataset`.
    #[must_use]
    pub fn new(backend: impl DatasetBackend) -> Self {
        Dataset {
            backend: Arc::new(backend),
        }
    }
    /// Returns the root directory metadata for this dataset's crawl.
    #[must_use]
    pub fn root_dir(&self) -> DirMeta {
        DirMeta::new_root(&self.backend.root_url())
    }
}