use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;
use crate::types::{
DocumentMedia, DocumentType, MediaResult,
};
/// Lookup table pairing a lowercase file extension (without the leading dot)
/// with the `DocumentType` it is reported as.
///
/// `is_document_url` and `detect_document_type` iterate this table in
/// declaration order, so keep that in mind before reordering entries.
const DOCUMENT_EXTENSIONS: &[(&str, DocumentType)] = &[
("pdf", DocumentType::Pdf),
// Word-processor formats, including OpenDocument text and RTF, share `Word`.
("doc", DocumentType::Word),
("docx", DocumentType::Word),
("odt", DocumentType::Word),
("rtf", DocumentType::Word),
// Spreadsheet formats, including OpenDocument spreadsheets, share `Excel`.
("xls", DocumentType::Excel),
("xlsx", DocumentType::Excel),
("ods", DocumentType::Excel),
("csv", DocumentType::Csv),
// Presentation formats, including OpenDocument presentations, share `PowerPoint`.
("ppt", DocumentType::PowerPoint),
("pptx", DocumentType::PowerPoint),
("odp", DocumentType::PowerPoint),
("txt", DocumentType::Text),
("epub", DocumentType::Epub),
];
/// Collects every document reference found in `document`.
///
/// Four passes run in a fixed order, and results keep that order:
/// 1. `<a href>` links whose URL looks like a document (`is_document_url`),
/// 2. `<a download>` links that carry an `href`, whatever their extension,
/// 3. `<object data>` / `<embed src>` elements whose URL contains `.pdf`,
/// 4. `<iframe src>` elements whose URL contains `.pdf`.
///
/// `base_url`, when given, is used to resolve relative URLs into
/// `absolute_url`.
///
/// Entries are de-duplicated across *all* passes by their resolved URL
/// (falling back to the raw attribute value when it cannot be resolved), so a
/// PDF that is both linked and embedded is reported once.
pub fn extract_documents(document: &Html, base_url: Option<&Url>) -> Vec<DocumentMedia> {
    let mut documents = Vec::new();
    let mut seen_urls: HashSet<String> = HashSet::new();

    // Single de-duplication rule shared by every pass: key on the absolute
    // URL when available so `/a.pdf` and `https://host/a.pdf` collapse.
    let mut push_unique = |doc: DocumentMedia| {
        let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
        if seen_urls.insert(key) {
            documents.push(doc);
        }
    };

    // (selector, whether the href must look like a document URL)
    let anchor_passes: [(&str, bool); 2] = [("a[href]", true), ("a[download]", false)];
    for &(css, needs_document_extension) in anchor_passes.iter() {
        if let Ok(sel) = Selector::parse(css) {
            for el in document.select(&sel) {
                let qualifies = el.value().attr("href").map_or(false, |href| {
                    !needs_document_extension || is_document_url(href)
                });
                if qualifies {
                    if let Some(doc) = extract_document_link(&el, base_url) {
                        push_unique(doc);
                    }
                }
            }
        }
    }

    if let Ok(sel) = Selector::parse("object[data*='.pdf'], embed[src*='.pdf']") {
        for el in document.select(&sel) {
            // `<object>` carries its URL in `data`, `<embed>` in `src`.
            let src = el.value().attr("data").or_else(|| el.value().attr("src"));
            if let Some(src) = src {
                push_unique(DocumentMedia {
                    url: src.to_string(),
                    absolute_url: resolve_url(src, base_url),
                    doc_type: DocumentType::Pdf,
                    mime_type: Some("application/pdf".to_string()),
                    ..Default::default()
                });
            }
        }
    }

    if let Ok(sel) = Selector::parse("iframe[src*='.pdf']") {
        for el in document.select(&sel) {
            if let Some(src) = el.value().attr("src") {
                push_unique(DocumentMedia {
                    url: src.to_string(),
                    absolute_url: resolve_url(src, base_url),
                    doc_type: DocumentType::Pdf,
                    title: el.value().attr("title").map(|s| s.to_string()),
                    mime_type: Some("application/pdf".to_string()),
                    ..Default::default()
                });
            }
        }
    }

    documents
}
/// Builds a `DocumentMedia` from an anchor element.
///
/// Returns `None` when the element has no `href`. The title is taken from the
/// `title` attribute if present, otherwise from the trimmed link text when it
/// is non-empty, otherwise from the filename derived from the `href`.
/// `size_bytes` and `page_count` are always left unset.
fn extract_document_link(el: &ElementRef, base_url: Option<&Url>) -> Option<DocumentMedia> {
    let element = el.value();
    let href = element.attr("href")?;

    let filename = extract_filename(href);
    let doc_type = detect_document_type(href);

    let title = match element.attr("title") {
        Some(explicit) => Some(explicit.to_string()),
        None => {
            let link_text: String = el.text().collect();
            let link_text = link_text.trim();
            if link_text.is_empty() {
                filename.clone()
            } else {
                Some(link_text.to_string())
            }
        }
    };

    Some(DocumentMedia {
        url: href.to_string(),
        absolute_url: resolve_url(href, base_url),
        mime_type: guess_document_mime(&doc_type),
        doc_type,
        filename,
        title,
        size_bytes: None,
        page_count: None,
    })
}
/// Returns `true` when `url` references a file with a known document
/// extension (see `DOCUMENT_EXTENSIONS`), compared case-insensitively.
///
/// An extension counts when `.ext` is found at the very end of the URL or is
/// immediately followed by `?` (query string), `&` (next query parameter) or
/// `#` (fragment, e.g. `manual.pdf#page=3`).
fn is_document_url(url: &str) -> bool {
    let url_lower = url.to_lowercase();
    DOCUMENT_EXTENSIONS.iter().any(|(ext, _)| {
        let needle = format!(".{}", ext);
        url_lower.match_indices(needle.as_str()).any(|(start, _)| {
            // Inspect the character right after the candidate extension so
            // that `.pdfx` or `.docs` are not mistaken for `.pdf` / `.doc`.
            matches!(
                url_lower[start + needle.len()..].chars().next(),
                None | Some('?') | Some('&') | Some('#')
            )
        })
    })
}
/// Classifies `url` by file extension, case-insensitively.
///
/// Resolution order:
/// 1. The extension of the final path segment (query string and fragment
///    removed) is looked up exactly in `DOCUMENT_EXTENSIONS`. This makes
///    `/export.csv?source=summary.pdf` a CSV rather than a PDF.
/// 2. Otherwise the whole URL is scanned, in table order, for `.ext` that is
///    not followed by another ASCII letter or digit. This still recognises
///    names passed as query values (`/download?file=a.pdf&x=1`) while
///    ignoring look-alikes such as the `.docs` in `www.docs.example.com`.
///
/// Returns `DocumentType::Other` when nothing matches.
fn detect_document_type(url: &str) -> DocumentType {
    let url_lower = url.to_lowercase();

    let path = url_lower
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");
    let last_segment = path.rsplit('/').next().unwrap_or("");
    if let Some(dot) = last_segment.rfind('.') {
        let path_ext = &last_segment[dot + 1..];
        if let Some((_, doc_type)) = DOCUMENT_EXTENSIONS
            .iter()
            .find(|(ext, _)| *ext == path_ext)
        {
            return *doc_type;
        }
    }

    for (ext, doc_type) in DOCUMENT_EXTENSIONS {
        let needle = format!(".{}", ext);
        let bounded = url_lower.match_indices(needle.as_str()).any(|(start, _)| {
            url_lower[start + needle.len()..]
                .chars()
                .next()
                .map_or(true, |next| !next.is_ascii_alphanumeric())
        });
        if bounded {
            return *doc_type;
        }
    }

    DocumentType::Other
}
/// Extracts the percent-decoded final path segment of `url`.
///
/// Both the query string (`?…`) and the fragment (`#…`) are discarded before
/// the last `/`-separated segment is taken, so `/manual.pdf#page=3` yields
/// `manual.pdf`.
///
/// Returns `None` when the segment is empty (for example a URL ending in `/`)
/// or when percent-decoding fails.
fn extract_filename(url: &str) -> Option<String> {
    // Cut at whichever of `?` / `#` comes first; everything after it is not
    // part of the path.
    let path = url.split(|c: char| c == '?' || c == '#').next()?;
    let filename = path.rsplit('/').next()?;
    let decoded = urlencoding::decode(filename).ok()?;
    if decoded.is_empty() {
        None
    } else {
        Some(decoded.into_owned())
    }
}
/// Maps a `DocumentType` to a representative MIME type.
///
/// Each Office family is reported with its OpenXML MIME type; `Other` has no
/// MIME type and yields `None`.
fn guess_document_mime(doc_type: &DocumentType) -> Option<String> {
    let mime: &str = match doc_type {
        DocumentType::Other => return None,
        DocumentType::Pdf => "application/pdf",
        DocumentType::Word => {
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        }
        DocumentType::Excel => {
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        }
        DocumentType::PowerPoint => {
            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        }
        DocumentType::Text => "text/plain",
        DocumentType::Csv => "text/csv",
        DocumentType::Epub => "application/epub+zip",
    };
    Some(mime.to_owned())
}
/// Turns `href` into an absolute URL string where possible.
///
/// * `http://` / `https://` URLs are returned unchanged.
/// * Protocol-relative URLs (`//host/path`) inherit the scheme of `base_url`
///   when that base is plain `http`; in every other case (no base, `https`
///   base, or a non-web base scheme) they are given `https`.
/// * Anything else is joined onto `base_url`; without a base, or when the
///   join fails, the result is `None`.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href.to_string());
    }
    if href.starts_with("//") {
        // A network-path reference takes the scheme of the document it
        // appears in; forcing https on an http-only site would produce links
        // that may not resolve.
        let scheme = match base_url.map(|base| base.scheme()) {
            Some("http") => "http",
            _ => "https",
        };
        return Some(format!("{}:{}", scheme, href));
    }
    base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
}
/// Parses `html` and extracts all document references from it.
///
/// `base_url` is parsed leniently: an unparsable base is treated the same as
/// no base at all, in which case relative links get no `absolute_url`.
/// The returned result is always `Ok`.
pub fn extract_documents_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<DocumentMedia>> {
    let parsed_base = match base_url {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };
    let dom = Html::parse_document(html);
    let found = extract_documents(&dom, parsed_base.as_ref());
    Ok(found)
}
/// Returns the absolute URLs of every document found in `html`.
///
/// Documents whose URL could not be made absolute are skipped, and an
/// extraction error yields an empty list.
pub fn get_document_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    let found = extract_documents_from_html(html, base_url).unwrap_or_default();
    let mut urls = Vec::new();
    for doc in found {
        if let Some(absolute) = doc.absolute_url {
            urls.push(absolute);
        }
    }
    urls
}
/// Cheap check for whether `document` references at least one document,
/// without building any `DocumentMedia` values.
///
/// Looks at the same sources `extract_documents` collects from: anchors whose
/// `href` looks like a document URL, `<a download>` links that have an
/// `href`, and `<object>` / `<embed>` / `<iframe>` elements whose URL
/// contains `.pdf`.
pub fn has_documents(document: &Html) -> bool {
    if let Ok(sel) = Selector::parse("a[href]") {
        let has_document_link = document
            .select(&sel)
            .any(|el| el.value().attr("href").map_or(false, is_document_url));
        if has_document_link {
            return true;
        }
    }

    // Sources that qualify purely by matching the selector; no per-element
    // inspection is needed, so the first hit is enough.
    if let Ok(sel) = Selector::parse(
        "a[download][href], object[data*='.pdf'], embed[src*='.pdf'], iframe[src*='.pdf']",
    ) {
        return document.select(&sel).next().is_some();
    }

    false
}
/// Returns references to the entries of `documents` whose type is PDF,
/// in their original order.
pub fn get_pdfs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
    let mut pdfs = Vec::new();
    for doc in documents {
        if doc.doc_type == DocumentType::Pdf {
            pdfs.push(doc);
        }
    }
    pdfs
}
/// Returns references to the Word, Excel and PowerPoint entries of
/// `documents`, in their original order.
pub fn get_office_docs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
    let mut office = Vec::new();
    for doc in documents {
        match doc.doc_type {
            DocumentType::Word | DocumentType::Excel | DocumentType::PowerPoint => {
                office.push(doc);
            }
            _ => {}
        }
    }
    office
}
/// Tallies how many entries of `documents` there are per `DocumentType`.
///
/// Types that do not occur are absent from the map rather than mapped to 0.
pub fn count_by_type(documents: &[DocumentMedia]) -> std::collections::HashMap<DocumentType, usize> {
    documents
        .iter()
        .fold(std::collections::HashMap::new(), |mut tally, doc| {
            *tally.entry(doc.doc_type).or_insert(0) += 1;
            tally
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an HTML snippet into a document tree for the extractors.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Builds one default `DocumentMedia` per requested type.
    fn media_of_types(types: &[DocumentType]) -> Vec<DocumentMedia> {
        types
            .iter()
            .map(|&doc_type| DocumentMedia {
                doc_type,
                ..Default::default()
            })
            .collect()
    }

    #[test]
    fn test_extract_pdf_link() {
        let page = parse_html(r#"<a href="/documents/report.pdf">Download Report</a>"#);
        let base = Url::parse("https://example.com").unwrap();

        let found = extract_documents(&page, Some(&base));

        assert_eq!(found.len(), 1);
        let report = &found[0];
        assert_eq!(report.doc_type, DocumentType::Pdf);
        assert_eq!(report.title.as_deref(), Some("Download Report"));
        assert_eq!(
            report.absolute_url.as_deref(),
            Some("https://example.com/documents/report.pdf")
        );
    }

    #[test]
    fn test_extract_word_document() {
        let page =
            parse_html(r#"<a href="/files/document.docx" title="Word Document">Download</a>"#);

        let found = extract_documents(&page, None);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Word);
        // The `title` attribute wins over the link text.
        assert_eq!(found[0].title.as_deref(), Some("Word Document"));
    }

    #[test]
    fn test_extract_excel_document() {
        let page = parse_html(r#"<a href="/data/spreadsheet.xlsx">Spreadsheet</a>"#);

        let found = extract_documents(&page, None);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Excel);
    }

    #[test]
    fn test_extract_embedded_pdf() {
        let page =
            parse_html(r#"<iframe src="/viewer/document.pdf" title="PDF Viewer"></iframe>"#);
        let base = Url::parse("https://example.com").unwrap();

        let found = extract_documents(&page, Some(&base));

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Pdf);
    }

    #[test]
    fn test_extract_download_attribute() {
        let page =
            parse_html(r#"<a href="/files/data.csv" download="export.csv">Export Data</a>"#);

        let found = extract_documents(&page, None);

        assert!(!found.is_empty());
    }

    #[test]
    fn test_is_document_url() {
        let document_urls = [
            "/file.pdf",
            "/file.docx",
            "/file.xlsx",
            "/file.pdf?download=true",
        ];
        for url in document_urls.iter() {
            assert!(is_document_url(url), "expected a document URL: {}", url);
        }

        let other_urls = ["/page.html", "/image.jpg"];
        for url in other_urls.iter() {
            assert!(!is_document_url(url), "expected a non-document URL: {}", url);
        }
    }

    #[test]
    fn test_detect_document_type() {
        let expectations = [
            ("/file.pdf", DocumentType::Pdf),
            ("/file.docx", DocumentType::Word),
            ("/file.xlsx", DocumentType::Excel),
            ("/file.pptx", DocumentType::PowerPoint),
            ("/file.txt", DocumentType::Text),
            ("/file.epub", DocumentType::Epub),
        ];
        for (url, expected) in expectations.iter() {
            assert_eq!(detect_document_type(url), *expected, "url: {}", url);
        }
    }

    #[test]
    fn test_extract_filename() {
        let expectations = [
            ("https://example.com/files/report.pdf", "report.pdf"),
            ("/path/to/document.docx", "document.docx"),
            ("/file.pdf?v=1", "file.pdf"),
        ];
        for (url, expected) in expectations.iter() {
            assert_eq!(extract_filename(url).as_deref(), Some(*expected), "url: {}", url);
        }
    }

    #[test]
    fn test_has_documents() {
        let with_docs = parse_html(r#"<a href="file.pdf">PDF</a>"#);
        let without_docs = parse_html(r#"<a href="/page">Link</a>"#);

        assert!(has_documents(&with_docs));
        assert!(!has_documents(&without_docs));
    }

    #[test]
    fn test_get_pdfs() {
        let docs = media_of_types(&[
            DocumentType::Pdf,
            DocumentType::Word,
            DocumentType::Pdf,
        ]);

        assert_eq!(get_pdfs(&docs).len(), 2);
    }

    #[test]
    fn test_get_office_docs() {
        let docs = media_of_types(&[
            DocumentType::Pdf,
            DocumentType::Word,
            DocumentType::Excel,
        ]);

        assert_eq!(get_office_docs(&docs).len(), 2);
    }

    #[test]
    fn test_guess_document_mime() {
        assert_eq!(
            guess_document_mime(&DocumentType::Pdf).as_deref(),
            Some("application/pdf")
        );
        assert_eq!(
            guess_document_mime(&DocumentType::Text).as_deref(),
            Some("text/plain")
        );
    }

    #[test]
    fn test_count_by_type() {
        let docs = media_of_types(&[
            DocumentType::Pdf,
            DocumentType::Pdf,
            DocumentType::Word,
        ]);

        let counts = count_by_type(&docs);

        assert_eq!(counts.get(&DocumentType::Pdf), Some(&2));
        assert_eq!(counts.get(&DocumentType::Word), Some(&1));
    }
}