Skip to main content

leann_core/document_loaders/
mod.rs

1//! Document loaders for various file formats.
2//!
3//! Provides text extraction from binary document formats (PDF, etc.)
4//! that cannot be loaded with simple `read_to_string`.
5
6#[cfg(feature = "pdf")]
7pub mod pdf;
8
9use anyhow::Result;
10use std::path::Path;
11
12/// Extract text content from a file, handling both text and binary formats.
13///
14/// For text files, reads the file as UTF-8. For supported binary formats
15/// (PDF when the `pdf` feature is enabled), uses specialized extractors.
16///
17/// Returns `None` if the file format is not supported or extraction fails.
18pub fn extract_text(path: &Path) -> Result<Option<String>> {
19    let ext = path
20        .extension()
21        .map(|e| e.to_string_lossy().to_lowercase())
22        .unwrap_or_default();
23
24    match ext.as_str() {
25        #[cfg(feature = "pdf")]
26        "pdf" => match pdf::extract_pdf_text(path) {
27            Ok(text) if !text.trim().is_empty() => Ok(Some(text)),
28            Ok(_) => {
29                tracing::warn!("PDF has no extractable text: {}", path.display());
30                Ok(None)
31            }
32            Err(e) => {
33                tracing::warn!("Failed to extract text from PDF {}: {}", path.display(), e);
34                Ok(None)
35            }
36        },
37        #[cfg(not(feature = "pdf"))]
38        "pdf" => {
39            tracing::warn!(
40                "PDF support not enabled. Rebuild with `pdf` feature to load: {}",
41                path.display()
42            );
43            Ok(None)
44        }
45        _ => {
46            // Text-based file: try read_to_string
47            match std::fs::read_to_string(path) {
48                Ok(content) if !content.trim().is_empty() => Ok(Some(content)),
49                Ok(_) => Ok(None),
50                Err(e) => {
51                    tracing::debug!("Could not read {} as text: {}", path.display(), e);
52                    Ok(None)
53                }
54            }
55        }
56    }
57}
58
/// Reports whether `path` names a binary document format, i.e. one that
/// needs a dedicated loader instead of `read_to_string`.
///
/// Only the `pdf` extension is recognized, compared without regard to ASCII
/// case. Paths with no extension yield `false`. The answer does not depend
/// on whether the `pdf` feature is enabled.
pub fn is_binary_document(path: &Path) -> bool {
    matches!(path.extension(), Some(ext) if ext.eq_ignore_ascii_case("pdf"))
}