halldyll_media/
documents.rs

1//! Document extraction for halldyll-media
2//!
3//! Extracts documents from HTML with support for:
4//! - PDF links
5//! - Office documents (Word, Excel, PowerPoint)
6//! - E-books (EPUB)
7//! - Download links
8
9use scraper::{Html, Selector, ElementRef};
10use std::collections::HashSet;
11use url::Url;
12
13use crate::types::{
14    DocumentMedia, DocumentType, MediaResult,
15};
16
17// ============================================================================
18// CONSTANTS
19// ============================================================================
20
21/// Document file extensions
22const DOCUMENT_EXTENSIONS: &[(&str, DocumentType)] = &[
23    ("pdf", DocumentType::Pdf),
24    ("doc", DocumentType::Word),
25    ("docx", DocumentType::Word),
26    ("odt", DocumentType::Word),
27    ("rtf", DocumentType::Word),
28    ("xls", DocumentType::Excel),
29    ("xlsx", DocumentType::Excel),
30    ("ods", DocumentType::Excel),
31    ("csv", DocumentType::Csv),
32    ("ppt", DocumentType::PowerPoint),
33    ("pptx", DocumentType::PowerPoint),
34    ("odp", DocumentType::PowerPoint),
35    ("txt", DocumentType::Text),
36    ("epub", DocumentType::Epub),
37];
38
39// ============================================================================
40// EXTRACTION FUNCTIONS
41// ============================================================================
42
43/// Extract all documents from HTML document
44pub fn extract_documents(document: &Html, base_url: Option<&Url>) -> Vec<DocumentMedia> {
45    let mut documents = Vec::new();
46    let mut seen_urls: HashSet<String> = HashSet::new();
47    
48    // Extract from links
49    if let Ok(sel) = Selector::parse("a[href]") {
50        for el in document.select(&sel) {
51            if let Some(href) = el.value().attr("href") {
52                if is_document_url(href) {
53                    if let Some(doc) = extract_document_link(&el, base_url) {
54                        let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
55                        if seen_urls.insert(key) {
56                            documents.push(doc);
57                        }
58                    }
59                }
60            }
61        }
62    }
63    
64    // Extract from download attributes
65    if let Ok(sel) = Selector::parse("a[download]") {
66        for el in document.select(&sel) {
67            if el.value().attr("href").is_some() {
68                if let Some(doc) = extract_document_link(&el, base_url) {
69                    let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
70                    if seen_urls.insert(key) {
71                        documents.push(doc);
72                    }
73                }
74            }
75        }
76    }
77    
78    // Extract from object/embed for PDFs
79    if let Ok(sel) = Selector::parse("object[data*='.pdf'], embed[src*='.pdf']") {
80        for el in document.select(&sel) {
81            let src = el.value().attr("data").or_else(|| el.value().attr("src"));
82            if let Some(src) = src {
83                if seen_urls.insert(src.to_string()) {
84                    let doc = DocumentMedia {
85                        url: src.to_string(),
86                        absolute_url: resolve_url(src, base_url),
87                        doc_type: DocumentType::Pdf,
88                        mime_type: Some("application/pdf".to_string()),
89                        ..Default::default()
90                    };
91                    documents.push(doc);
92                }
93            }
94        }
95    }
96    
97    // Extract from iframes (embedded PDFs)
98    if let Ok(sel) = Selector::parse("iframe[src*='.pdf']") {
99        for el in document.select(&sel) {
100            if let Some(src) = el.value().attr("src") {
101                if seen_urls.insert(src.to_string()) {
102                    let doc = DocumentMedia {
103                        url: src.to_string(),
104                        absolute_url: resolve_url(src, base_url),
105                        doc_type: DocumentType::Pdf,
106                        title: el.value().attr("title").map(|s| s.to_string()),
107                        mime_type: Some("application/pdf".to_string()),
108                        ..Default::default()
109                    };
110                    documents.push(doc);
111                }
112            }
113        }
114    }
115    
116    documents
117}
118
119/// Extract document from link element
120fn extract_document_link(el: &ElementRef, base_url: Option<&Url>) -> Option<DocumentMedia> {
121    let href = el.value().attr("href")?;
122    let absolute_url = resolve_url(href, base_url);
123    
124    // Determine document type
125    let doc_type = detect_document_type(href);
126    
127    // Extract filename
128    let filename = extract_filename(href);
129    
130    // Get title from link text or title attribute
131    let title = el.value().attr("title")
132        .map(|s| s.to_string())
133        .or_else(|| {
134            let text = el.text().collect::<String>().trim().to_string();
135            if !text.is_empty() { Some(text) } else { None }
136        })
137        .or_else(|| filename.clone());
138    
139    // Guess MIME type
140    let mime_type = guess_document_mime(&doc_type);
141    
142    Some(DocumentMedia {
143        url: href.to_string(),
144        absolute_url,
145        doc_type,
146        filename,
147        title,
148        mime_type,
149        size_bytes: None,
150        page_count: None,
151    })
152}
153
154/// Check if URL points to a document
155fn is_document_url(url: &str) -> bool {
156    let url_lower = url.to_lowercase();
157    DOCUMENT_EXTENSIONS.iter().any(|(ext, _)| {
158        url_lower.ends_with(&format!(".{}", ext)) ||
159        url_lower.contains(&format!(".{}?", ext)) ||
160        url_lower.contains(&format!(".{}&", ext))
161    })
162}
163
164/// Detect document type from URL
165fn detect_document_type(url: &str) -> DocumentType {
166    let url_lower = url.to_lowercase();
167    
168    for (ext, doc_type) in DOCUMENT_EXTENSIONS {
169        if url_lower.contains(&format!(".{}", ext)) {
170            return *doc_type;
171        }
172    }
173    
174    DocumentType::Other
175}
176
177/// Extract filename from URL
178fn extract_filename(url: &str) -> Option<String> {
179    // Remove query string
180    let path = url.split('?').next()?;
181    
182    // Get last segment
183    let filename = path.rsplit('/').next()?;
184    
185    // Decode URL encoding
186    let decoded = urlencoding::decode(filename).ok()?;
187    
188    if decoded.is_empty() {
189        None
190    } else {
191        Some(decoded.into_owned())
192    }
193}
194
195/// Guess MIME type for document
196fn guess_document_mime(doc_type: &DocumentType) -> Option<String> {
197    match doc_type {
198        DocumentType::Pdf => Some("application/pdf".to_string()),
199        DocumentType::Word => Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string()),
200        DocumentType::Excel => Some("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string()),
201        DocumentType::PowerPoint => Some("application/vnd.openxmlformats-officedocument.presentationml.presentation".to_string()),
202        DocumentType::Text => Some("text/plain".to_string()),
203        DocumentType::Csv => Some("text/csv".to_string()),
204        DocumentType::Epub => Some("application/epub+zip".to_string()),
205        DocumentType::Other => None,
206    }
207}
208
209/// Resolve relative URL
210fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
211    if href.starts_with("http://") || href.starts_with("https://") {
212        return Some(href.to_string());
213    }
214    
215    if href.starts_with("//") {
216        return Some(format!("https:{}", href));
217    }
218    
219    base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
220}
221
222// ============================================================================
223// CONVENIENCE FUNCTIONS
224// ============================================================================
225
226/// Extract documents from HTML string
227pub fn extract_documents_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<DocumentMedia>> {
228    let document = Html::parse_document(html);
229    let base = base_url.and_then(|u| Url::parse(u).ok());
230    Ok(extract_documents(&document, base.as_ref()))
231}
232
233/// Get all document URLs from HTML
234pub fn get_document_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
235    extract_documents_from_html(html, base_url)
236        .unwrap_or_default()
237        .into_iter()
238        .filter_map(|d| d.absolute_url)
239        .collect()
240}
241
242/// Check if HTML has documents
243pub fn has_documents(document: &Html) -> bool {
244    if let Ok(sel) = Selector::parse("a[href]") {
245        document.select(&sel)
246            .any(|el| {
247                el.value().attr("href")
248                    .map(is_document_url)
249                    .unwrap_or(false)
250            })
251    } else {
252        false
253    }
254}
255
256/// Get PDFs only
257pub fn get_pdfs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
258    documents.iter()
259        .filter(|d| d.doc_type == DocumentType::Pdf)
260        .collect()
261}
262
263/// Get Office documents
264pub fn get_office_docs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
265    documents.iter()
266        .filter(|d| matches!(d.doc_type, 
267            DocumentType::Word | DocumentType::Excel | DocumentType::PowerPoint
268        ))
269        .collect()
270}
271
272/// Count documents by type
273pub fn count_by_type(documents: &[DocumentMedia]) -> std::collections::HashMap<DocumentType, usize> {
274    let mut counts = std::collections::HashMap::new();
275    for doc in documents {
276        *counts.entry(doc.doc_type).or_insert(0) += 1;
277    }
278    counts
279}
280
281// ============================================================================
282// TESTS
283// ============================================================================
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML snippet and run the extractor on it, optionally with a
    /// base URL.
    fn extract_from(snippet: &str, base: Option<&str>) -> Vec<DocumentMedia> {
        let dom = Html::parse_document(snippet);
        let base = base.map(|raw| Url::parse(raw).unwrap());
        extract_documents(&dom, base.as_ref())
    }

    /// Build placeholder documents that carry only the given types.
    fn docs_of(types: &[DocumentType]) -> Vec<DocumentMedia> {
        types
            .iter()
            .map(|&doc_type| DocumentMedia { doc_type, ..Default::default() })
            .collect()
    }

    #[test]
    fn test_extract_pdf_link() {
        let found = extract_from(
            r#"<a href="/documents/report.pdf">Download Report</a>"#,
            Some("https://example.com"),
        );

        assert_eq!(found.len(), 1);
        let report = &found[0];
        assert_eq!(report.doc_type, DocumentType::Pdf);
        assert_eq!(report.title.as_deref(), Some("Download Report"));
        assert_eq!(
            report.absolute_url.as_deref(),
            Some("https://example.com/documents/report.pdf")
        );
    }

    #[test]
    fn test_extract_word_document() {
        let found = extract_from(
            r#"<a href="/files/document.docx" title="Word Document">Download</a>"#,
            None,
        );

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Word);
        assert_eq!(found[0].title.as_deref(), Some("Word Document"));
    }

    #[test]
    fn test_extract_excel_document() {
        let found = extract_from(r#"<a href="/data/spreadsheet.xlsx">Spreadsheet</a>"#, None);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Excel);
    }

    #[test]
    fn test_extract_embedded_pdf() {
        let found = extract_from(
            r#"<iframe src="/viewer/document.pdf" title="PDF Viewer"></iframe>"#,
            Some("https://example.com"),
        );

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Pdf);
    }

    #[test]
    fn test_extract_download_attribute() {
        let found = extract_from(
            r#"<a href="/files/data.csv" download="export.csv">Export Data</a>"#,
            None,
        );

        assert!(!found.is_empty());
    }

    #[test]
    fn test_is_document_url() {
        let accepted = ["/file.pdf", "/file.docx", "/file.xlsx", "/file.pdf?download=true"];
        for url in accepted.iter().copied() {
            assert!(is_document_url(url), "{} should be recognised", url);
        }

        let rejected = ["/page.html", "/image.jpg"];
        for url in rejected.iter().copied() {
            assert!(!is_document_url(url), "{} should not be recognised", url);
        }
    }

    #[test]
    fn test_detect_document_type() {
        let cases = [
            ("/file.pdf", DocumentType::Pdf),
            ("/file.docx", DocumentType::Word),
            ("/file.xlsx", DocumentType::Excel),
            ("/file.pptx", DocumentType::PowerPoint),
            ("/file.txt", DocumentType::Text),
            ("/file.epub", DocumentType::Epub),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(detect_document_type(url), *expected, "{}", url);
        }
    }

    #[test]
    fn test_extract_filename() {
        let cases = [
            ("https://example.com/files/report.pdf", "report.pdf"),
            ("/path/to/document.docx", "document.docx"),
            ("/file.pdf?v=1", "file.pdf"),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(extract_filename(url).as_deref(), Some(*expected), "{}", url);
        }
    }

    #[test]
    fn test_has_documents() {
        let with_docs = Html::parse_document(r#"<a href="file.pdf">PDF</a>"#);
        let without_docs = Html::parse_document(r#"<a href="/page">Link</a>"#);

        assert!(has_documents(&with_docs));
        assert!(!has_documents(&without_docs));
    }

    #[test]
    fn test_get_pdfs() {
        let docs = docs_of(&[DocumentType::Pdf, DocumentType::Word, DocumentType::Pdf]);

        assert_eq!(get_pdfs(&docs).len(), 2);
    }

    #[test]
    fn test_get_office_docs() {
        let docs = docs_of(&[DocumentType::Pdf, DocumentType::Word, DocumentType::Excel]);

        assert_eq!(get_office_docs(&docs).len(), 2);
    }

    #[test]
    fn test_guess_document_mime() {
        assert_eq!(
            guess_document_mime(&DocumentType::Pdf).as_deref(),
            Some("application/pdf")
        );
        assert_eq!(
            guess_document_mime(&DocumentType::Text).as_deref(),
            Some("text/plain")
        );
    }

    #[test]
    fn test_count_by_type() {
        let docs = docs_of(&[DocumentType::Pdf, DocumentType::Pdf, DocumentType::Word]);

        let counts = count_by_type(&docs);
        assert_eq!(counts.get(&DocumentType::Pdf), Some(&2));
        assert_eq!(counts.get(&DocumentType::Word), Some(&1));
    }
}