gdown-core 0.1.0

Core download logic for Google Drive
Documentation
//! URL parsing for Google Drive links
//!
//! Supports various Google Drive URL formats:
//! - https://drive.google.com/uc?id=<id>
//! - https://drive.google.com/file/d/<id>/view
//! - https://drive.google.com/open?id=<id>
//! - https://drive.google.com/drive/folders/<id>
//! - https://docs.google.com/document/d/<id>/edit
//! - https://docs.google.com/spreadsheets/d/<id>/edit
//! - https://docs.google.com/presentation/d/<id>/edit
//!
//! Input without a URL scheme is treated as a bare ID.

use crate::error::{GdownError, Result};
use regex::Regex;

/// Google Drive file or folder ID.
///
/// This is a plain `String` alias; nothing here validates its contents. The
/// IDs used in this file's tests vary in length and contain `-` and `_` in
/// addition to letters and digits, so do not assume a fixed width or a purely
/// alphanumeric alphabet.
pub type FileId = String;

/// Parse a Google Drive URL and extract the file ID.
///
/// # Arguments
///
/// * `url` - A Google Drive URL string
///
/// # Returns
///
/// * `Ok((Some(file_id), is_download_link))` - Successfully parsed
/// * `Ok((None, false))` - Not a Google Drive URL
///
/// # Examples
///
/// ```
/// use gdown_core::parse_url;
/// let (id, is_dl) = parse_url("https://drive.google.com/file/d/1l_5RK28JRL19wpT22B-DY9We3TVXnnQQ/view").unwrap();
/// assert!(id.is_some());
/// ```
pub fn parse_url(url: &str) -> Result<(Option<FileId>, bool)> {
    // Handle bare file IDs (just the ID, no URL)
    if !url.contains("://") && !url.starts_with("http") {
        return Ok((Some(url.trim().to_string()), false));
    }

    let parsed = url::Url::parse(url).map_err(GdownError::UrlError)?;

    let host = parsed.host_str().unwrap_or_default();
    let is_drive = host == "drive.google.com" || host == "docs.google.com";

    if !is_drive {
        return Ok((None, false));
    }

    let path = parsed.path();
    let is_download_link = path == "/uc" || path.ends_with("/uc");

    // Try to extract file_id from query string first
    let query: std::collections::HashMap<String, String> = parsed
        .query_pairs()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();

    if let Some(id) = query.get("id") {
        return Ok((Some(id.clone()), is_download_link));
    }

    // Pattern 1: /file/d/<id>/(edit|view)
    let re1 = Regex::new(r"^/file/d/([^/]+)/(edit|view)$").unwrap();
    if let Some(caps) = re1.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 2: /file/u/<digit>+/d/<id>/(edit|view)
    let re2 = Regex::new(r"^/file/u/[0-9]+/d/([^/]+)/(edit|view)$").unwrap();
    if let Some(caps) = re2.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 3: /document/d/<id>/(edit|htmlview|view)
    let re3 = Regex::new(r"^/document/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re3.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 4: /document/u/<digit>+/d/<id>/(edit|htmlview|view)
    let re4 = Regex::new(r"^/document/u/[0-9]+/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re4.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 5: /presentation/d/<id>/(edit|htmlview|view)
    let re5 = Regex::new(r"^/presentation/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re5.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 6: /presentation/u/<digit>+/d/<id>/(edit|htmlview|view)
    let re6 = Regex::new(r"^/presentation/u/[0-9]+/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re6.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 7: /spreadsheets/d/<id>/(edit|htmlview|view)
    let re7 = Regex::new(r"^/spreadsheets/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re7.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 8: /spreadsheets/u/<digit>+/d/<id>/(edit|htmlview|view)
    let re8 = Regex::new(r"^/spreadsheets/u/[0-9]+/d/([^/]+)/(edit|htmlview|view)$").unwrap();
    if let Some(caps) = re8.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 9: /drive/folders/<id>
    let re9 = Regex::new(r"^/drive/folders/([^/]+)").unwrap();
    if let Some(caps) = re9.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    // Pattern 10: /drive/folders/<id>/view (with view suffix)
    let re10 = Regex::new(r"^/drive/folders/([^/]+)/view$").unwrap();
    if let Some(caps) = re10.captures(path) {
        return Ok((Some(caps.get(1).unwrap().as_str().to_string()), false));
    }

    Ok((None, is_download_link))
}

/// Report whether `parse_url` can extract an ID from `url`.
///
/// Returns `true` exactly when `parse_url` succeeds and yields an ID. That
/// includes scheme-less input which `parse_url` treats as a bare file ID, so a
/// `true` result does not by itself prove the input was a full URL. Parse
/// errors and URLs without a recognizable ID both return `false`.
pub fn is_google_drive_url(url: &str) -> bool {
    matches!(parse_url(url), Ok((Some(_), _)))
}

/// Build the `drive.google.com/uc` download URL for `file_id`.
///
/// The ID is inserted verbatim as the `id` query parameter (no
/// percent-encoding is applied), followed by `export=download`.
pub fn build_download_url(file_id: &str) -> String {
    format!("https://drive.google.com/uc?id={file_id}&export=download")
}

/// Build an export URL for a Google Docs-style file.
///
/// Both `file_id` and `format` are inserted verbatim (no percent-encoding):
/// the ID goes into the path and `format` becomes the `format` query
/// parameter.
///
/// NOTE(review): the path is always `/document/d/<id>/export`, whatever kind
/// of file the ID refers to. Whether that endpoint also serves Sheets and
/// Slides IDs is not established by this file; confirm before relying on it.
pub fn build_export_url(file_id: &str, format: &str) -> String {
    format!("https://docs.google.com/document/d/{file_id}/export?format={format}")
}

#[cfg(test)]
mod tests {
    use super::*;

    // Real file IDs from gdown-main tests
    const REAL_FILE_ID: &str = "0B_NiLAzvehC9R2stRmQyM3ZiVjQ";
    const REAL_FILE_ID2: &str = "0B9P1L--7Wd2vU3VUVlFnbTgtS2c";
    const REAL_FOLDER_ID: &str = "15uNXeRBIhVvZJIhL4yTw4IsStMhUaaxl";
    const REAL_GOOGLE_DOC_ID: &str = "1DvsG277pWa4WMssXjD9qYYAdF51y7hVidZ6eklfq480";

    /// Parse `url` and assert both the extracted ID and the download-link flag.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather than
    /// at this helper.
    #[track_caller]
    fn assert_parses(url: &str, expected_id: Option<&str>, expected_is_dl: bool) {
        let (id, is_dl) = parse_url(url).unwrap();
        assert_eq!(id.as_deref(), expected_id, "file id for {}", url);
        assert_eq!(is_dl, expected_is_dl, "is_download_link for {}", url);
    }

    #[test]
    fn test_parse_google_open() {
        // https://drive.google.com/open?id=0B_NiLAzvehC9R2stRmQyM3ZiVjQ from test_parse_url.py
        let url = format!("https://drive.google.com/open?id={}", REAL_FILE_ID);
        assert_parses(&url, Some(REAL_FILE_ID), false);
    }

    #[test]
    fn test_parse_uc_download_link() {
        // https://drive.google.com/uc?id=0B_NiLAzvehC9R2stRmQyM3ZiVjQ from test_parse_url.py
        let url = format!("https://drive.google.com/uc?id={}", REAL_FILE_ID);
        assert_parses(&url, Some(REAL_FILE_ID), true);
    }

    #[test]
    fn test_parse_uc_without_id() {
        // A download-style path with no `id` parameter yields no ID but is
        // still flagged as a download link.
        assert_parses("https://drive.google.com/uc", None, true);
    }

    #[test]
    fn test_parse_file_view_link() {
        // https://drive.google.com/file/d/0B9P1L--7Wd2vU3VUVlFnbTgtS2c/view?usp=sharing from test_download.py
        let url = format!(
            "https://drive.google.com/file/d/{}/view?usp=sharing",
            REAL_FILE_ID2
        );
        assert_parses(&url, Some(REAL_FILE_ID2), false);
    }

    #[test]
    fn test_parse_subdomain_uc_link() {
        // https://drive.google.com/a/jsk.imi.i.u-tokyo.ac.jp/uc?id=0B_NiLAzvehC9R2stRmQyM3ZiVjQ&export=download
        let url = format!(
            "https://drive.google.com/a/jsk.imi.i.u-tokyo.ac.jp/uc?id={}&export=download",
            REAL_FILE_ID
        );
        assert_parses(&url, Some(REAL_FILE_ID), true);
    }

    #[test]
    fn test_parse_file_edit_link() {
        let url = format!("https://drive.google.com/file/d/{}/edit", REAL_FILE_ID);
        assert_parses(&url, Some(REAL_FILE_ID), false);
    }

    #[test]
    fn test_parse_open_link() {
        // Extra query parameters next to `id` must not affect the result.
        let url = format!(
            "https://drive.google.com/open?id={}&usp=sharing",
            REAL_FILE_ID
        );
        assert_parses(&url, Some(REAL_FILE_ID), false);
    }

    #[test]
    fn test_parse_google_doc_edit() {
        // Real Google Docs URL
        let url = format!(
            "https://docs.google.com/document/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_google_doc_view() {
        let url = format!(
            "https://docs.google.com/document/d/{}/view",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_google_doc_htmlview() {
        let url = format!(
            "https://docs.google.com/document/d/{}/htmlview",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_google_sheet_edit() {
        let url = format!(
            "https://docs.google.com/spreadsheets/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_google_slides_edit() {
        // https://docs.google.com/presentation/d/1DvsG277pWa4WMssXjD9qYYAdF51y7hVidZ6eklfq480/edit?usp=drive_link
        let url = format!(
            "https://docs.google.com/presentation/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_folder_link() {
        // https://drive.google.com/drive/folders/15uNXeRBIhVvZJIhL4yTw4IsStMhUaaxl from README
        let url = format!("https://drive.google.com/drive/folders/{}", REAL_FOLDER_ID);
        assert_parses(&url, Some(REAL_FOLDER_ID), false);
    }

    #[test]
    fn test_parse_folder_link_with_query() {
        // The query string is not part of the path, so the ID is still found.
        let url = format!(
            "https://drive.google.com/drive/folders/{}?usp=sharing",
            REAL_FOLDER_ID
        );
        assert_parses(&url, Some(REAL_FOLDER_ID), false);
    }

    #[test]
    fn test_parse_file_u_d_link() {
        let url = format!("https://drive.google.com/file/u/0/d/{}/view", REAL_FILE_ID);
        assert_parses(&url, Some(REAL_FILE_ID), false);
    }

    #[test]
    fn test_parse_document_u_d_link() {
        let url = format!(
            "https://docs.google.com/document/u/0/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_presentation_u_d_link() {
        let url = format!(
            "https://docs.google.com/presentation/u/0/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_spreadsheets_u_d_link() {
        let url = format!(
            "https://docs.google.com/spreadsheets/u/0/d/{}/edit",
            REAL_GOOGLE_DOC_ID
        );
        assert_parses(&url, Some(REAL_GOOGLE_DOC_ID), false);
    }

    #[test]
    fn test_parse_non_gdrive_url() {
        // https://github.com/wkentaro/gdown/archive/refs/tags/v4.0.0.tar.gz from conftest.py
        assert_parses(
            "https://github.com/wkentaro/gdown/archive/refs/tags/v4.0.0.tar.gz",
            None,
            false,
        );
    }

    #[test]
    fn test_parse_bare_id() {
        assert_parses(REAL_FILE_ID, Some(REAL_FILE_ID), false);
    }

    #[test]
    fn test_is_google_drive_url() {
        let drive_url = format!("https://drive.google.com/uc?id={}", REAL_FILE_ID);
        assert!(is_google_drive_url(&drive_url));
        assert!(!is_google_drive_url(
            "https://github.com/wkentaro/gdown/archive/refs/tags/v4.0.0.tar.gz"
        ));
    }

    #[test]
    fn test_build_download_url() {
        let url = build_download_url(REAL_FILE_ID);
        assert_eq!(
            url,
            format!(
                "https://drive.google.com/uc?id={}&export=download",
                REAL_FILE_ID
            )
        );
    }

    #[test]
    fn test_build_export_url() {
        let url = build_export_url(REAL_FILE_ID, "pdf");
        assert_eq!(
            url,
            format!(
                "https://docs.google.com/document/d/{}/export?format=pdf",
                REAL_FILE_ID
            )
        );
    }
}