1use scraper::{Html, Selector, ElementRef};
10use std::collections::HashSet;
11use url::Url;
12
13use crate::types::{
14 DocumentMedia, DocumentType, MediaResult,
15};
16
17const DOCUMENT_EXTENSIONS: &[(&str, DocumentType)] = &[
23 ("pdf", DocumentType::Pdf),
24 ("doc", DocumentType::Word),
25 ("docx", DocumentType::Word),
26 ("odt", DocumentType::Word),
27 ("rtf", DocumentType::Word),
28 ("xls", DocumentType::Excel),
29 ("xlsx", DocumentType::Excel),
30 ("ods", DocumentType::Excel),
31 ("csv", DocumentType::Csv),
32 ("ppt", DocumentType::PowerPoint),
33 ("pptx", DocumentType::PowerPoint),
34 ("odp", DocumentType::PowerPoint),
35 ("txt", DocumentType::Text),
36 ("epub", DocumentType::Epub),
37];
38
39pub fn extract_documents(document: &Html, base_url: Option<&Url>) -> Vec<DocumentMedia> {
45 let mut documents = Vec::new();
46 let mut seen_urls: HashSet<String> = HashSet::new();
47
48 if let Ok(sel) = Selector::parse("a[href]") {
50 for el in document.select(&sel) {
51 if let Some(href) = el.value().attr("href") {
52 if is_document_url(href) {
53 if let Some(doc) = extract_document_link(&el, base_url) {
54 let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
55 if seen_urls.insert(key) {
56 documents.push(doc);
57 }
58 }
59 }
60 }
61 }
62 }
63
64 if let Ok(sel) = Selector::parse("a[download]") {
66 for el in document.select(&sel) {
67 if el.value().attr("href").is_some() {
68 if let Some(doc) = extract_document_link(&el, base_url) {
69 let key = doc.absolute_url.as_ref().unwrap_or(&doc.url).clone();
70 if seen_urls.insert(key) {
71 documents.push(doc);
72 }
73 }
74 }
75 }
76 }
77
78 if let Ok(sel) = Selector::parse("object[data*='.pdf'], embed[src*='.pdf']") {
80 for el in document.select(&sel) {
81 let src = el.value().attr("data").or_else(|| el.value().attr("src"));
82 if let Some(src) = src {
83 if seen_urls.insert(src.to_string()) {
84 let doc = DocumentMedia {
85 url: src.to_string(),
86 absolute_url: resolve_url(src, base_url),
87 doc_type: DocumentType::Pdf,
88 mime_type: Some("application/pdf".to_string()),
89 ..Default::default()
90 };
91 documents.push(doc);
92 }
93 }
94 }
95 }
96
97 if let Ok(sel) = Selector::parse("iframe[src*='.pdf']") {
99 for el in document.select(&sel) {
100 if let Some(src) = el.value().attr("src") {
101 if seen_urls.insert(src.to_string()) {
102 let doc = DocumentMedia {
103 url: src.to_string(),
104 absolute_url: resolve_url(src, base_url),
105 doc_type: DocumentType::Pdf,
106 title: el.value().attr("title").map(|s| s.to_string()),
107 mime_type: Some("application/pdf".to_string()),
108 ..Default::default()
109 };
110 documents.push(doc);
111 }
112 }
113 }
114 }
115
116 documents
117}
118
119fn extract_document_link(el: &ElementRef, base_url: Option<&Url>) -> Option<DocumentMedia> {
121 let href = el.value().attr("href")?;
122 let absolute_url = resolve_url(href, base_url);
123
124 let doc_type = detect_document_type(href);
126
127 let filename = extract_filename(href);
129
130 let title = el.value().attr("title")
132 .map(|s| s.to_string())
133 .or_else(|| {
134 let text = el.text().collect::<String>().trim().to_string();
135 if !text.is_empty() { Some(text) } else { None }
136 })
137 .or_else(|| filename.clone());
138
139 let mime_type = guess_document_mime(&doc_type);
141
142 Some(DocumentMedia {
143 url: href.to_string(),
144 absolute_url,
145 doc_type,
146 filename,
147 title,
148 mime_type,
149 size_bytes: None,
150 page_count: None,
151 })
152}
153
154fn is_document_url(url: &str) -> bool {
156 let url_lower = url.to_lowercase();
157 DOCUMENT_EXTENSIONS.iter().any(|(ext, _)| {
158 url_lower.ends_with(&format!(".{}", ext)) ||
159 url_lower.contains(&format!(".{}?", ext)) ||
160 url_lower.contains(&format!(".{}&", ext))
161 })
162}
163
164fn detect_document_type(url: &str) -> DocumentType {
166 let url_lower = url.to_lowercase();
167
168 for (ext, doc_type) in DOCUMENT_EXTENSIONS {
169 if url_lower.contains(&format!(".{}", ext)) {
170 return *doc_type;
171 }
172 }
173
174 DocumentType::Other
175}
176
177fn extract_filename(url: &str) -> Option<String> {
179 let path = url.split('?').next()?;
181
182 let filename = path.rsplit('/').next()?;
184
185 let decoded = urlencoding::decode(filename).ok()?;
187
188 if decoded.is_empty() {
189 None
190 } else {
191 Some(decoded.into_owned())
192 }
193}
194
195fn guess_document_mime(doc_type: &DocumentType) -> Option<String> {
197 match doc_type {
198 DocumentType::Pdf => Some("application/pdf".to_string()),
199 DocumentType::Word => Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document".to_string()),
200 DocumentType::Excel => Some("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".to_string()),
201 DocumentType::PowerPoint => Some("application/vnd.openxmlformats-officedocument.presentationml.presentation".to_string()),
202 DocumentType::Text => Some("text/plain".to_string()),
203 DocumentType::Csv => Some("text/csv".to_string()),
204 DocumentType::Epub => Some("application/epub+zip".to_string()),
205 DocumentType::Other => None,
206 }
207}
208
209fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
211 if href.starts_with("http://") || href.starts_with("https://") {
212 return Some(href.to_string());
213 }
214
215 if href.starts_with("//") {
216 return Some(format!("https:{}", href));
217 }
218
219 base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
220}
221
222pub fn extract_documents_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<DocumentMedia>> {
228 let document = Html::parse_document(html);
229 let base = base_url.and_then(|u| Url::parse(u).ok());
230 Ok(extract_documents(&document, base.as_ref()))
231}
232
233pub fn get_document_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
235 extract_documents_from_html(html, base_url)
236 .unwrap_or_default()
237 .into_iter()
238 .filter_map(|d| d.absolute_url)
239 .collect()
240}
241
242pub fn has_documents(document: &Html) -> bool {
244 if let Ok(sel) = Selector::parse("a[href]") {
245 document.select(&sel)
246 .any(|el| {
247 el.value().attr("href")
248 .map(is_document_url)
249 .unwrap_or(false)
250 })
251 } else {
252 false
253 }
254}
255
256pub fn get_pdfs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
258 documents.iter()
259 .filter(|d| d.doc_type == DocumentType::Pdf)
260 .collect()
261}
262
263pub fn get_office_docs(documents: &[DocumentMedia]) -> Vec<&DocumentMedia> {
265 documents.iter()
266 .filter(|d| matches!(d.doc_type,
267 DocumentType::Word | DocumentType::Excel | DocumentType::PowerPoint
268 ))
269 .collect()
270}
271
272pub fn count_by_type(documents: &[DocumentMedia]) -> std::collections::HashMap<DocumentType, usize> {
274 let mut counts = std::collections::HashMap::new();
275 for doc in documents {
276 *counts.entry(doc.doc_type).or_insert(0) += 1;
277 }
278 counts
279}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses an HTML snippet into a document for the extractors under test.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Builds an otherwise-default `DocumentMedia` of the given type.
    fn media_of(doc_type: DocumentType) -> DocumentMedia {
        DocumentMedia { doc_type, ..Default::default() }
    }

    #[test]
    fn test_extract_pdf_link() {
        let page = parse_html(r#"<a href="/documents/report.pdf">Download Report</a>"#);
        let base = Url::parse("https://example.com").unwrap();

        let found = extract_documents(&page, Some(&base));

        assert_eq!(found.len(), 1);
        let report = &found[0];
        assert_eq!(report.doc_type, DocumentType::Pdf);
        assert_eq!(report.title, Some("Download Report".to_string()));
        assert_eq!(
            report.absolute_url,
            Some("https://example.com/documents/report.pdf".to_string())
        );
    }

    #[test]
    fn test_extract_word_document() {
        let page =
            parse_html(r#"<a href="/files/document.docx" title="Word Document">Download</a>"#);

        let found = extract_documents(&page, None);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Word);
        // The explicit `title` attribute wins over the link text.
        assert_eq!(found[0].title, Some("Word Document".to_string()));
    }

    #[test]
    fn test_extract_excel_document() {
        let page = parse_html(r#"<a href="/data/spreadsheet.xlsx">Spreadsheet</a>"#);

        let found = extract_documents(&page, None);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Excel);
    }

    #[test]
    fn test_extract_embedded_pdf() {
        let page =
            parse_html(r#"<iframe src="/viewer/document.pdf" title="PDF Viewer"></iframe>"#);
        let base = Url::parse("https://example.com").unwrap();

        let found = extract_documents(&page, Some(&base));

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].doc_type, DocumentType::Pdf);
    }

    #[test]
    fn test_extract_download_attribute() {
        let page =
            parse_html(r#"<a href="/files/data.csv" download="export.csv">Export Data</a>"#);

        let found = extract_documents(&page, None);

        assert!(!found.is_empty());
    }

    #[test]
    fn test_is_document_url() {
        for accepted in ["/file.pdf", "/file.docx", "/file.xlsx", "/file.pdf?download=true"].iter() {
            assert!(is_document_url(accepted));
        }
        for rejected in ["/page.html", "/image.jpg"].iter() {
            assert!(!is_document_url(rejected));
        }
    }

    #[test]
    fn test_detect_document_type() {
        let cases = [
            ("/file.pdf", DocumentType::Pdf),
            ("/file.docx", DocumentType::Word),
            ("/file.xlsx", DocumentType::Excel),
            ("/file.pptx", DocumentType::PowerPoint),
            ("/file.txt", DocumentType::Text),
            ("/file.epub", DocumentType::Epub),
        ];
        for (url, expected) in cases.iter() {
            assert_eq!(detect_document_type(url), *expected);
        }
    }

    #[test]
    fn test_extract_filename() {
        let cases = [
            ("https://example.com/files/report.pdf", "report.pdf"),
            ("/path/to/document.docx", "document.docx"),
            ("/file.pdf?v=1", "file.pdf"),
        ];
        for (url, expected) in cases.iter() {
            assert_eq!(extract_filename(url), Some(expected.to_string()));
        }
    }

    #[test]
    fn test_has_documents() {
        let with_docs = parse_html(r#"<a href="file.pdf">PDF</a>"#);
        let without_docs = parse_html(r#"<a href="/page">Link</a>"#);

        assert!(has_documents(&with_docs));
        assert!(!has_documents(&without_docs));
    }

    #[test]
    fn test_get_pdfs() {
        let docs = vec![
            media_of(DocumentType::Pdf),
            media_of(DocumentType::Word),
            media_of(DocumentType::Pdf),
        ];

        assert_eq!(get_pdfs(&docs).len(), 2);
    }

    #[test]
    fn test_get_office_docs() {
        let docs = vec![
            media_of(DocumentType::Pdf),
            media_of(DocumentType::Word),
            media_of(DocumentType::Excel),
        ];

        assert_eq!(get_office_docs(&docs).len(), 2);
    }

    #[test]
    fn test_guess_document_mime() {
        assert_eq!(
            guess_document_mime(&DocumentType::Pdf),
            Some("application/pdf".to_string())
        );
        assert_eq!(
            guess_document_mime(&DocumentType::Text),
            Some("text/plain".to_string())
        );
    }

    #[test]
    fn test_count_by_type() {
        let docs = vec![
            media_of(DocumentType::Pdf),
            media_of(DocumentType::Pdf),
            media_of(DocumentType::Word),
        ];

        let counts = count_by_type(&docs);

        assert_eq!(counts.get(&DocumentType::Pdf), Some(&2));
        assert_eq!(counts.get(&DocumentType::Word), Some(&1));
    }
}
425}