// zippyst 5.0.0
//
// Resolve zippyshare.com download links
// Documentation
use std::{
    fmt::{Display, Formatter},
    str,
};

use boa_engine::Context;
use http::Uri;
#[cfg(feature = "fetch")]
use hyper::body::HttpBody;
#[cfg(feature = "fetch")]
use hyper::client::Client;
#[cfg(feature = "fetch")]
use hyper::{header, Body, Response};
#[cfg(feature = "fetch")]
use hyper_tls::HttpsConnector;
use lazy_static::lazy_static;
use percent_encoding::percent_decode_str;
use regex::Regex;

use crate::error::Error;

lazy_static! {
    /// Captures (group 1) the body of each `<script type="text/javascript">` element.
    /// `(?s)` lets `.` match newlines so multi-line scripts are captured, and the lazy `.+?`
    /// stops at the first closing tag.
    static ref SCRIPT_REGEX: Regex =
        Regex::new(r#"(?s)<script type="text/javascript">(.+?)</script>"#)
            .expect("cannot build script extraction regex");
    /// Captures (group 1) the numeric `class` attribute of the empty `<span id="omg">` element.
    /// The `style` attribute is optional and lives in a non-capturing group, so group 1 is the
    /// only capture; whitespace before the closing `>` is tolerated.
    static ref OMG_SPAN_REGEX: Regex =
        Regex::new(r#"<span id="omg"\s+class="(\d+)"(?:\s+style=".*")?\s*>\s*</span>"#)
            .expect("cannot build span extraction regex");
    /// Splits a computed download path `/d/<id>/<key>/<name>` into the file id (group 1),
    /// the numeric key (group 2) and the still percent-encoded file name (group 3).
    static ref PATH_REGEX: Regex =
        Regex::new(r#"/d/(\w+)/(\d+)/([/\w%.-]+)"#).expect("cannot build uri regex");
}

/// The pieces of a resolved download link.
///
/// The `Display` implementation and the `link*` helpers in this file assemble these fields as
/// `https://<domain>/d/<id>/<key>/<name>`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct File {
    /// Host part used when building the download URL.
    pub domain: String,
    /// File identifier: the path segment that follows `/d/`.
    pub id: String,
    /// Numeric key: the path segment that follows the identifier.
    pub key: i64,
    /// File name with percent-encoding decoded.
    pub name: String,
    /// File name exactly as it appears in the download path (still percent-encoded).
    pub encoded_name: String,
}

impl File {
    /// Downloads the page at `uri` and passes its body to [`File::parse`].
    ///
    /// If the first response is a redirection, its `Location` header is requested once; a
    /// second redirection is not followed and is reported as an invalid status code. The
    /// original `uri` (not the redirection target) is the one handed to [`File::parse`].
    ///
    /// # Errors
    ///
    /// * `Error::ContentFetchingFailure` when a request fails.
    /// * `Error::InvalidStatusCode` when a response is neither a success nor a redirection,
    ///   or when the final response is not a success.
    /// * `Error::RedirectionFailure` when the `Location` header is missing or is not valid
    ///   header text.
    /// * `Error::InvalidUrl` when the `Location` value cannot be parsed as a URI.
    /// * `Error::ContentStreamingFailure` when reading the body fails.
    /// * `Error::InvalidUtf8PageContent` when the body is not valid UTF-8.
    /// * Any error returned by [`File::parse`].
    #[cfg(feature = "fetch")]
    pub async fn fetch_and_parse(uri: Uri) -> Result<Self, Error> {
        /// Sends a GET request over a fresh HTTPS-capable client and rejects every status
        /// that is neither a success nor a redirection.
        async fn get_page(target: Uri) -> Result<Response<Body>, Error> {
            let connector = HttpsConnector::new();
            let client = Client::builder().build::<_, Body>(connector);
            let response = client
                .get(target)
                .await
                .map_err(|err| Error::ContentFetchingFailure { source: err })?;

            let status = response.status();
            if status.is_success() || status.is_redirection() {
                Ok(response)
            } else {
                Err(Error::InvalidStatusCode { code: status })
            }
        }

        let mut response = get_page(uri.clone()).await?;

        // A single redirection hop is followed; anything beyond that is left to the
        // status check below.
        if response.status().is_redirection() {
            let location: Uri = {
                let header_value = response
                    .headers()
                    .get(header::LOCATION)
                    .ok_or(Error::RedirectionFailure)?;
                let text = header_value
                    .to_str()
                    .map_err(|_| Error::RedirectionFailure)?;
                text.parse()
                    .map_err(|err| Error::InvalidUrl { source: err })?
            };
            response = get_page(location).await?;
        }

        // Whatever we ended up with must be a success by now.
        let final_status = response.status();
        if !final_status.is_success() {
            return Err(Error::InvalidStatusCode { code: final_status });
        }

        // Accumulate the streamed body before decoding it as text.
        let mut raw_page = Vec::new();
        while let Some(chunk) = response.data().await {
            let bytes = chunk.map_err(|err| Error::ContentStreamingFailure { source: err })?;
            raw_page.extend_from_slice(&bytes);
        }

        let page = str::from_utf8(&raw_page)
            .map_err(|err| Error::InvalidUtf8PageContent { source: err })?;
        Self::parse(&uri, page)
    }

    pub fn parse(uri: &Uri, page_content: &str) -> Result<Self, Error> {
        let scripts = SCRIPT_REGEX
            .captures_iter(page_content)
            .filter_map(|script| {
                let inner = script.get(1)?.as_str().trim();
                inner
                    .contains("document.getElementById('dlbutton')")
                    .then_some(inner)
            })
            .collect::<Vec<_>>();
        if scripts.is_empty() {
            return Err(Error::ScriptNotFound);
        }

        let script_content = scripts
            .iter()
            .enumerate()
            .filter_map(|(i, s)| (!scripts[i + 1..].contains(s)).then_some(s))
            .copied()
            .collect::<Vec<_>>()
            .join("\n");

        let mut context = Context::default();
        let mut modified_script_content = format!(
            "{}\n{}\n{}\n{}",
            "let button = {};",
            "let fimage = {};",
            script_content
                .replace("document.getElementById('dlbutton')", "button")
                .replace("document.getElementById('fimage')", "fimage"),
            "button.href"
        );

        if script_content.contains("document.getElementById('omg').getAttribute('class')") {
            modified_script_content = modified_script_content.replace(
                "document.getElementById('omg').getAttribute('class')",
                OMG_SPAN_REGEX
                    .captures(page_content)
                    .ok_or(Error::LinkComputationFailure)?
                    .get(1)
                    .ok_or(Error::LinkComputationFailure)?
                    .as_str(),
            )
        }

        let path = context
            .eval(modified_script_content)
            .map_err(|_| Error::LinkComputationFailure)?
            .to_string(&mut context)
            .map_err(|_| Error::LinkComputationFailure)?;
        let groups = PATH_REGEX
            .captures(&*path)
            .ok_or(Error::LinkGeneratorExtractionFailure)?;

        Ok(Self {
            domain: uri.host().ok_or(Error::DomainExtractionFailure)?.to_owned(),
            id: groups
                .get(1)
                .ok_or(Error::FileIdExtractionFailure)?
                .as_str()
                .to_owned(),
            key: groups
                .get(2)
                .ok_or(Error::FileKeyExtractionFailure)?
                .as_str()
                .to_owned()
                .parse()
                .map_err(|_| Error::FileKeyExtractionFailure)?,
            name: percent_decode_str(
                groups
                    .get(3)
                    .ok_or(Error::FilenameExtractionFailure)?
                    .as_str(),
            )
            .decode_utf8()
            .map_err(|err| Error::InvalidUtf8Filename { source: err })?
            .to_string(),
            encoded_name: groups
                .get(3)
                .ok_or(Error::FilenameExtractionFailure)?
                .as_str()
                .to_owned(),
        })
    }

    pub fn link(&self) -> String {
        format!(
            "https://{}/d/{}/{}/DOWNLOAD",
            self.domain, self.id, self.key
        )
    }

    pub fn link_with_encoded_name(&self) -> String {
        format!(
            "https://{}/d/{}/{}/{}",
            self.domain, self.id, self.key, self.encoded_name
        )
    }
}

/// A `File` is displayed as its download URL including the percent-encoded file name.
impl Display for File {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let url = self.link_with_encoded_name();
        f.write_str(&url)
    }
}

#[cfg(test)]
mod tests {
    // The HTTP client and checksum imports are only used by the tests that call
    // `File::fetch_and_parse`, which exists only when the `fetch` feature is enabled.
    #[cfg(feature = "fetch")]
    use hyper::{body::HttpBody, Client};
    #[cfg(feature = "fetch")]
    use hyper_tls::HttpsConnector;
    #[cfg(feature = "fetch")]
    use md5::Context;
    use regex::Regex;

    use super::File;

    /// Returns `true` when `file.link()` has the expected direct-download URL shape.
    fn match_direct_download_format(file: &File) -> bool {
        Regex::new(r#"https://(?:w+\d+\.)?zippyshare\.com/d/[\w\d]+/\d+/DOWNLOAD"#)
            .unwrap()
            .is_match(&file.link())
    }

    /// Resolves a page over the network and checks the shape of the resulting link.
    #[cfg(feature = "fetch")]
    #[tokio::test]
    async fn file_link() {
        let file = File::fetch_and_parse(
            "https://www3.zippyshare.com/v/CDCi2wVT/file.html"
                .parse()
                .unwrap(),
        )
        .await
        .unwrap();
        assert!(match_direct_download_format(&file));
    }

    /// Resolves a page over the network, downloads the file behind the resolved link and
    /// compares its MD5 digest with a known value.
    #[cfg(feature = "fetch")]
    #[tokio::test]
    async fn file_checksum() {
        let file = File::fetch_and_parse(
            "https://www3.zippyshare.com/v/CDCi2wVT/file.html"
                .parse()
                .unwrap(),
        )
        .await
        .unwrap();

        let https = HttpsConnector::new();
        let client = Client::builder().build::<_, hyper::Body>(https);
        let mut response = client
            .get(file.link_with_encoded_name().parse().unwrap())
            .await
            .unwrap();
        assert!(response.status().is_success());

        let mut md5 = Context::new();
        while let Some(next) = response.data().await {
            md5.consume(&next.unwrap());
        }
        assert_eq!(
            md5.compute().0,
            [111, 29, 61, 152, 64, 180, 174, 33, 189, 191, 48, 97, 160, 9, 91, 63],
        );
    }

    /// Parses stored page snapshots offline, so earlier page layouts stay supported.
    #[test]
    fn old_formats() {
        for format in [
            include_str!("../page_payloads/2022_07_18.html"),
            include_str!("../page_payloads/2022_07_23.html"),
            include_str!("../page_payloads/2023_02_28.html"),
        ] {
            assert!(match_direct_download_format(
                &File::parse(
                    &"https://www3.zippyshare.com/v/CDCi2wVT/file.html"
                        .parse()
                        .unwrap(),
                    format
                )
                .unwrap()
            ));
        }
    }
}