// halldyll-media 0.1.0

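//! Document extraction for halldyll-media
//!
//! Extracts document references from HTML, with support for:
//! - PDF links and PDFs embedded via `<object>`, `<embed>` or `<iframe>`
//! - Office documents (Word, Excel, PowerPoint, and their OpenDocument equivalents)
//! - Plain-text and CSV files
//! - E-books (EPUB)
//! - Links marked with the `download` attribute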

use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;

use crate::types::{
    DocumentMedia, DocumentType, MediaResult,
};

// ============================================================================
// CONSTANTS
// ============================================================================

/// Document file extensions
///
/// Maps a lowercase file extension (without the leading dot) to the
/// [`DocumentType`] it is reported as. The URL helpers in this module scan the
/// table front to back, so earlier entries take precedence whenever more than
/// one entry could apply to the same URL.
const DOCUMENT_EXTENSIONS: &[(&str, DocumentType)] = &[
    // Portable documents
    ("pdf", DocumentType::Pdf),
    // Word-processing formats (binary, OOXML, OpenDocument, RTF)
    ("doc", DocumentType::Word),
    ("docx", DocumentType::Word),
    ("odt", DocumentType::Word),
    ("rtf", DocumentType::Word),
    // Spreadsheets
    ("xls", DocumentType::Excel),
    ("xlsx", DocumentType::Excel),
    ("ods", DocumentType::Excel),
    ("csv", DocumentType::Csv),
    // Presentations
    ("ppt", DocumentType::PowerPoint),
    ("pptx", DocumentType::PowerPoint),
    ("odp", DocumentType::PowerPoint),
    // Plain text and e-books
    ("txt", DocumentType::Text),
    ("epub", DocumentType::Epub),
];

// ============================================================================
// EXTRACTION FUNCTIONS
// ============================================================================

/// Extract all documents from HTML document
pub fn extract_documents(document: &Html, base_url: Option<&Url>) -> Vec<DocumentMedia> {
    let mut documents = Vec::new();
    let mut seen_urls: HashSet<String> = HashSet::new();
    
    // Extract from links
    if let Ok(sel) = Selector::parse("a[href]") {
        for el in document.select(&sel) {
            if let Some(href) = el.value().attr("href") {
                if is_document_url(href) {
                    if let Some(doc) = extract_document_link(&el, base_url) {
                        let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
                        if seen_urls.insert(key) {
                            documents.push(doc);
                        }
                    }
                }
            }
        }
    }
    
    // Extract from download attributes
    if let Ok(sel) = Selector::parse("a[download]") {
        for el in document.select(&sel) {
            if el.value().attr("href").is_some() {
                if let Some(doc) = extract_document_link(&el, base_url) {
                    let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
                    if seen_urls.insert(key) {
                        documents.push(doc);
                    }
                }
            }
        }
    }
    
    // Extract from object/embed for PDFs
    if let Ok(sel) = Selector::parse("object[data*='.pdf'], embed[src*='.pdf']") {
        for el in document.select(&sel) {
            let src = el.value().attr("data").or_else(|| el.value().attr("src"));
            if let Some(src) = src {
                if seen_urls.insert(src.to_string()) {
                    let doc = DocumentMedia {
                        url: src.to_string(),
                        absolute_url: resolve_url(src, base_url),
                        doc_type: DocumentType::Pdf,
                        mime_type: Some("application/pdf".to_string()),
                        ..Default::default()
                    };
                    documents.push(doc);
                }
            }
        }
    }
    
    // Extract from iframes (embedded PDFs)
    if let Ok(sel) = Selector::parse("iframe[src*='.pdf']") {
        for el in document.select(&sel) {
            if let Some(src) = el.value().attr("src") {
                if seen_urls.insert(src.to_string()) {
                    let doc = DocumentMedia {
                        url: src.to_string(),
                        absolute_url: resolve_url(src, base_url),
                        doc_type: DocumentType::Pdf,
                        title: el.value().attr("title").map(|s| s.to_string()),
                        mime_type: Some("application/pdf".to_string()),
                        ..Default::default()
                    };
                    documents.push(doc);
                }
            }
        }
    }
    
    documents
}

/// Build a [`DocumentMedia`] record from an anchor element.
///
/// Returns `None` when the element has no `href` attribute.
///
/// The title is chosen from the first non-blank candidate among:
/// the `title` attribute, the link's text content, and the file name taken
/// from the URL. Blank (empty or whitespace-only) `title` attributes are
/// skipped so they do not hide the more useful fallbacks.
fn extract_document_link(el: &ElementRef, base_url: Option<&Url>) -> Option<DocumentMedia> {
    let href = el.value().attr("href")?;
    let absolute_url = resolve_url(href, base_url);

    // Classification and file name are both derived from the raw href.
    let doc_type = detect_document_type(href);
    let filename = extract_filename(href);

    let title = el.value().attr("title")
        .map(str::trim)
        .filter(|attr_title| !attr_title.is_empty())
        .map(|attr_title| attr_title.to_string())
        .or_else(|| {
            let text = el.text().collect::<String>();
            let text = text.trim();
            if text.is_empty() { None } else { Some(text.to_string()) }
        })
        .or_else(|| filename.clone());

    // MIME type is inferred from the document family, not fetched from the server.
    let mime_type = guess_document_mime(&doc_type);

    Some(DocumentMedia {
        url: href.to_string(),
        absolute_url,
        doc_type,
        filename,
        title,
        mime_type,
        size_bytes: None,
        page_count: None,
    })
}

/// Check whether a URL appears to point to a document.
///
/// The comparison is case-insensitive. A URL matches when a known extension
/// from `DOCUMENT_EXTENSIONS` appears right after a dot and is immediately
/// followed by the end of the string, a query delimiter (`?` or `&`), or a
/// fragment delimiter (`#`) — so `report.pdf`, `report.pdf?dl=1` and
/// `report.pdf#page=3` all match, while `report.pdfx` does not.
fn is_document_url(url: &str) -> bool {
    let url_lower = url.to_lowercase();

    // Walk every dot once instead of formatting ".ext" strings per extension.
    url_lower.match_indices('.').any(|(dot, _)| {
        // '.' is a single byte, so `dot + 1` is always a valid char boundary.
        let after_dot = &url_lower[dot + 1..];
        DOCUMENT_EXTENSIONS.iter().any(|(ext, _)| {
            after_dot.strip_prefix(*ext).map_or(false, |rest| {
                rest.is_empty() || rest.starts_with(&['?', '&', '#'][..])
            })
        })
    })
}

/// Detect the document type from a URL (case-insensitive).
///
/// Detection runs in two passes:
/// 1. A strict pass that looks for `.ext` terminated by the end of the URL or
///    by `?`, `&` or `#`. Dots are examined left to right, so an extension in
///    the path wins over one in the query string, and a dot-label elsewhere
///    in the URL (e.g. the host `www.docs.example.com`) cannot shadow the
///    real extension.
/// 2. A loose fallback that accepts `.ext` anywhere in the URL, scanning
///    `DOCUMENT_EXTENSIONS` in table order. This keeps URLs such as
///    `/file.pdf/download` classified as before.
///
/// Returns `DocumentType::Other` when neither pass finds a known extension.
fn detect_document_type(url: &str) -> DocumentType {
    let url_lower = url.to_lowercase();

    // Pass 1: properly terminated extension.
    for (dot, _) in url_lower.match_indices('.') {
        // '.' is a single byte, so `dot + 1` is always a valid char boundary.
        let after_dot = &url_lower[dot + 1..];
        for (ext, doc_type) in DOCUMENT_EXTENSIONS {
            let terminated = after_dot.strip_prefix(*ext).map_or(false, |rest| {
                rest.is_empty() || rest.starts_with(&['?', '&', '#'][..])
            });
            if terminated {
                return *doc_type;
            }
        }
    }

    // Pass 2: loose substring match, first table entry wins.
    for (ext, doc_type) in DOCUMENT_EXTENSIONS {
        if url_lower.contains(&format!(".{}", ext)) {
            return *doc_type;
        }
    }

    DocumentType::Other
}

/// Extract the percent-decoded file name from a URL.
///
/// The query string (`?...`) and the fragment (`#...`) are dropped before the
/// last `/`-separated segment is taken, so `/docs/report.pdf#page=2` yields
/// `report.pdf`.
///
/// Returns `None` when the last segment is empty (for example a URL ending in
/// `/`) or when percent-decoding fails.
fn extract_filename(url: &str) -> Option<String> {
    // Neither the query nor the fragment is part of the file name.
    let path = url.split(|c: char| c == '?' || c == '#').next()?;

    // Get last segment
    let filename = path.rsplit('/').next()?;

    // Decode URL encoding
    let decoded = urlencoding::decode(filename).ok()?;

    if decoded.is_empty() {
        None
    } else {
        Some(decoded.into_owned())
    }
}

/// Map a document family to a representative MIME type.
///
/// The guess is made from the [`DocumentType`] alone: every Word, Excel and
/// PowerPoint document is reported with the corresponding OOXML MIME type,
/// whichever extension the URL actually had. `DocumentType::Other` has no
/// MIME type and yields `None`.
fn guess_document_mime(doc_type: &DocumentType) -> Option<String> {
    let mime: &str = match doc_type {
        DocumentType::Other => return None,
        DocumentType::Pdf => "application/pdf",
        DocumentType::Word => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        DocumentType::Excel => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        DocumentType::PowerPoint => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        DocumentType::Text => "text/plain",
        DocumentType::Csv => "text/csv",
        DocumentType::Epub => "application/epub+zip",
    };
    Some(mime.to_owned())
}

/// Resolve a (possibly relative) reference to an absolute URL string.
///
/// * References that already start with `http://` or `https://` (compared
///   case-insensitively, since URL schemes are case-insensitive) are returned
///   unchanged.
/// * Protocol-relative references (`//host/path`) inherit the scheme of
///   `base_url` when it is `http` or `https`, and default to `https` otherwise.
/// * Anything else is joined onto `base_url`.
///
/// Returns `None` when a relative reference cannot be resolved, either because
/// no base URL was supplied or because joining failed.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    // `get` returns None for short strings or non-boundary cuts instead of panicking.
    let has_prefix = |prefix: &str| {
        href.get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    };

    if has_prefix("http://") || has_prefix("https://") {
        return Some(href.to_string());
    }

    if href.starts_with("//") {
        // A network-path reference takes its scheme from the page it appears on.
        let scheme = base_url
            .map(|base| base.scheme())
            .filter(|scheme| matches!(*scheme, "http" | "https"))
            .unwrap_or("https");
        return Some(format!("{}:{}", scheme, href));
    }

    base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
}

// ============================================================================
// CONVENIENCE FUNCTIONS
// ============================================================================

/// Parse an HTML string and extract its documents.
///
/// `base_url`, when given, is parsed with `Url::parse`; a value that fails to
/// parse is treated the same as no base URL at all. This function always
/// returns `Ok`.
pub fn extract_documents_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<DocumentMedia>> {
    // An unparsable base URL is deliberately downgraded to "no base".
    let parsed_base = match base_url {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };
    let dom = Html::parse_document(html);
    let found = extract_documents(&dom, parsed_base.as_ref());
    Ok(found)
}

/// Collect the absolute URLs of all documents found in an HTML string.
///
/// Documents whose URL could not be resolved to an absolute form (their
/// `absolute_url` is `None`) are left out of the result. An extraction error
/// yields an empty list.
pub fn get_document_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    let found = extract_documents_from_html(html, base_url).unwrap_or_default();

    let mut urls = Vec::new();
    for doc in found {
        if let Some(absolute) = doc.absolute_url {
            urls.push(absolute);
        }
    }
    urls
}

/// Check whether a parsed HTML page references at least one document.
///
/// Mirrors the sources scanned by `extract_documents`, so this returns `true`
/// exactly when that function would return a non-empty list:
/// * an `<a href>` whose target passes `is_document_url`,
/// * an `<a download>` that carries an `href`,
/// * an `<object data>`, `<embed src>` or `<iframe src>` whose URL contains `.pdf`.
pub fn has_documents(document: &Html) -> bool {
    // Links need a per-element URL check, so they are tested separately.
    let has_document_link = Selector::parse("a[href]").map_or(false, |sel| {
        document
            .select(&sel)
            .any(|el| el.value().attr("href").map_or(false, is_document_url))
    });
    if has_document_link {
        return true;
    }

    // For the remaining sources the selector alone is decisive: any match counts.
    Selector::parse(
        "a[download][href], object[data*='.pdf'], embed[src*='.pdf'], iframe[src*='.pdf']",
    )
    .map_or(false, |sel| document.select(&sel).next().is_some())
}

/// Borrow the subset of `documents` whose type is `DocumentType::Pdf`,
/// preserving their original order.
pub fn get_pdfs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
    let mut pdfs = Vec::new();
    for doc in documents {
        if doc.doc_type == DocumentType::Pdf {
            pdfs.push(doc);
        }
    }
    pdfs
}

/// Borrow the subset of `documents` that are Word, Excel or PowerPoint
/// documents, preserving their original order.
pub fn get_office_docs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
    let mut office = Vec::new();
    for doc in documents {
        let is_office = matches!(
            doc.doc_type,
            DocumentType::Word | DocumentType::Excel | DocumentType::PowerPoint
        );
        if is_office {
            office.push(doc);
        }
    }
    office
}

/// Tally how many documents there are of each `DocumentType`.
///
/// Types that do not occur in `documents` are absent from the returned map.
pub fn count_by_type(documents: &[DocumentMedia]) -> std::collections::HashMap<DocumentType, usize> {
    let mut tally: std::collections::HashMap<DocumentType, usize> =
        std::collections::HashMap::new();
    documents
        .iter()
        .map(|doc| doc.doc_type)
        .for_each(|kind| *tally.entry(kind).or_default() += 1);
    tally
}

// ============================================================================
// TESTS
// ============================================================================

// Unit tests for the extraction entry points and the private URL helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML snippet into a full document for use in the tests below.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// A relative PDF link is resolved against the base URL and titled by its link text.
    #[test]
    fn test_extract_pdf_link() {
        let html = r#"<a href="/documents/report.pdf">Download Report</a>"#;
        let doc = parse_html(html);
        let base = Url::parse("https://example.com").unwrap();
        let documents = extract_documents(&doc, Some(&base));
        
        assert_eq!(documents.len(), 1);
        assert_eq!(documents[0].doc_type, DocumentType::Pdf);
        assert_eq!(documents[0].title, Some("Download Report".to_string()));
        assert_eq!(documents[0].absolute_url, Some("https://example.com/documents/report.pdf".to_string()));
    }

    /// The `title` attribute takes precedence over the link text.
    #[test]
    fn test_extract_word_document() {
        let html = r#"<a href="/files/document.docx" title="Word Document">Download</a>"#;
        let doc = parse_html(html);
        let documents = extract_documents(&doc, None);
        
        assert_eq!(documents.len(), 1);
        assert_eq!(documents[0].doc_type, DocumentType::Word);
        assert_eq!(documents[0].title, Some("Word Document".to_string()));
    }

    /// `.xlsx` links are classified as Excel documents.
    #[test]
    fn test_extract_excel_document() {
        let html = r#"<a href="/data/spreadsheet.xlsx">Spreadsheet</a>"#;
        let doc = parse_html(html);
        let documents = extract_documents(&doc, None);
        
        assert_eq!(documents.len(), 1);
        assert_eq!(documents[0].doc_type, DocumentType::Excel);
    }

    /// A PDF shown inside an `<iframe>` is picked up as a PDF document.
    #[test]
    fn test_extract_embedded_pdf() {
        let html = r#"<iframe src="/viewer/document.pdf" title="PDF Viewer"></iframe>"#;
        let doc = parse_html(html);
        let base = Url::parse("https://example.com").unwrap();
        let documents = extract_documents(&doc, Some(&base));
        
        assert_eq!(documents.len(), 1);
        assert_eq!(documents[0].doc_type, DocumentType::Pdf);
    }

    /// Links carrying a `download` attribute are extracted.
    #[test]
    fn test_extract_download_attribute() {
        let html = r#"<a href="/files/data.csv" download="export.csv">Export Data</a>"#;
        let doc = parse_html(html);
        let documents = extract_documents(&doc, None);
        
        assert!(!documents.is_empty());
    }

    /// Known extensions match (with or without a query string); web pages and images do not.
    #[test]
    fn test_is_document_url() {
        assert!(is_document_url("/file.pdf"));
        assert!(is_document_url("/file.docx"));
        assert!(is_document_url("/file.xlsx"));
        assert!(is_document_url("/file.pdf?download=true"));
        assert!(!is_document_url("/page.html"));
        assert!(!is_document_url("/image.jpg"));
    }

    /// Each extension maps to the expected document family.
    #[test]
    fn test_detect_document_type() {
        assert_eq!(detect_document_type("/file.pdf"), DocumentType::Pdf);
        assert_eq!(detect_document_type("/file.docx"), DocumentType::Word);
        assert_eq!(detect_document_type("/file.xlsx"), DocumentType::Excel);
        assert_eq!(detect_document_type("/file.pptx"), DocumentType::PowerPoint);
        assert_eq!(detect_document_type("/file.txt"), DocumentType::Text);
        assert_eq!(detect_document_type("/file.epub"), DocumentType::Epub);
    }

    /// The file name is the last path segment, with any query string removed.
    #[test]
    fn test_extract_filename() {
        assert_eq!(extract_filename("https://example.com/files/report.pdf"), Some("report.pdf".to_string()));
        assert_eq!(extract_filename("/path/to/document.docx"), Some("document.docx".to_string()));
        assert_eq!(extract_filename("/file.pdf?v=1"), Some("file.pdf".to_string()));
    }

    /// `has_documents` is true for a page with a PDF link and false for a plain link.
    #[test]
    fn test_has_documents() {
        let with_docs = r#"<a href="file.pdf">PDF</a>"#;
        let without_docs = r#"<a href="/page">Link</a>"#;
        
        assert!(has_documents(&parse_html(with_docs)));
        assert!(!has_documents(&parse_html(without_docs)));
    }

    /// Only the PDF entries are returned.
    #[test]
    fn test_get_pdfs() {
        let docs = vec![
            DocumentMedia { doc_type: DocumentType::Pdf, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Word, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Pdf, ..Default::default() },
        ];
        
        let pdfs = get_pdfs(&docs);
        assert_eq!(pdfs.len(), 2);
    }

    /// Word and Excel count as Office documents; PDF does not.
    #[test]
    fn test_get_office_docs() {
        let docs = vec![
            DocumentMedia { doc_type: DocumentType::Pdf, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Word, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Excel, ..Default::default() },
        ];
        
        let office = get_office_docs(&docs);
        assert_eq!(office.len(), 2);
    }

    /// Spot-check the MIME types guessed for PDF and plain text.
    #[test]
    fn test_guess_document_mime() {
        assert_eq!(guess_document_mime(&DocumentType::Pdf), Some("application/pdf".to_string()));
        assert_eq!(guess_document_mime(&DocumentType::Text), Some("text/plain".to_string()));
    }

    /// Counts are grouped per document type.
    #[test]
    fn test_count_by_type() {
        let docs = vec![
            DocumentMedia { doc_type: DocumentType::Pdf, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Pdf, ..Default::default() },
            DocumentMedia { doc_type: DocumentType::Word, ..Default::default() },
        ];
        
        let counts = count_by_type(&docs);
        assert_eq!(counts.get(&DocumentType::Pdf), Some(&2));
        assert_eq!(counts.get(&DocumentType::Word), Some(&1));
    }
}