#![cfg(feature = "pdf-parsing")]
use anyhow::Result;
use serde::{Deserialize, Serialize};
pub use lopdf;
pub use pdf_extract;
/// Options controlling PDF text extraction.
///
/// NOTE(review): these flags are stored by `PdfExtractor::new` but none of
/// them is consulted by the extraction code visible in this file — confirm
/// they are wired up elsewhere before relying on them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfConfig {
/// Keep the original text layout where possible (presumably; unused here).
pub preserve_layout: bool,
/// Read title/author/etc. from the document info dictionary (presumably; unused here).
pub extract_metadata: bool,
/// Run OCR on image-only pages (presumably; unused here).
pub enable_ocr: bool,
/// Page-count cap; the default is 0, presumably meaning "no limit" — TODO confirm.
pub max_pages: usize,
}
impl Default for PdfConfig {
fn default() -> Self {
Self {
preserve_layout: true,
extract_metadata: true,
enable_ocr: false,
max_pages: 0,
}
}
}
/// The result of extracting a PDF: document metadata, per-page text, and
/// the concatenated full text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedDocument {
/// Title, author, page count, etc. pulled from the PDF.
pub metadata: DocumentMetadata,
/// Per-page extraction results, in page order.
pub pages: Vec<ExtractedPage>,
/// The entire document's text as one string (the same text the pages were split from).
pub full_text: String,
}
/// Document-level metadata, sourced from the PDF trailer's `/Info`
/// dictionary plus the page tree. Entries missing from the document are
/// left at their `Default` values (`None` / empty / 0).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentMetadata {
pub title: Option<String>,
pub author: Option<String>,
pub subject: Option<String>,
pub keywords: Vec<String>,
/// Application that created the original document.
pub creator: Option<String>,
/// Software that produced the PDF file itself.
pub producer: Option<String>,
/// Raw PDF date string; not parsed into a structured type.
pub creation_date: Option<String>,
/// Raw PDF date string; not parsed into a structured type.
pub modification_date: Option<String>,
/// Number of pages reported by the PDF page tree.
pub page_count: usize,
}
/// Text extracted from a single page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedPage {
/// 1-based page number.
pub page_number: usize,
/// Raw text of the page.
pub text: String,
/// Detected sections; may be empty when no structure detection ran.
pub sections: Vec<Section>,
/// Detected tables; may be empty when no table detection ran.
pub tables: Vec<Table>,
}
/// A heading-delimited region of page text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Section {
/// Heading text, if the section has one.
pub heading: Option<String>,
/// Body text of the section.
pub content: String,
/// Nesting depth of the heading (presumably 1 = top level — TODO confirm).
pub level: usize,
}
/// A table detected in a page, as caption + header row + data rows.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Table {
/// Optional caption rendered above the table.
pub caption: Option<String>,
/// Column headers; may be empty for headerless tables.
pub headers: Vec<String>,
/// Data rows; each inner `Vec` is one row's cells.
pub rows: Vec<Vec<String>>,
}
impl Table {
    /// Renders the table as GitHub-flavored Markdown.
    ///
    /// The optional caption is emitted first as a bold paragraph. A header
    /// row plus its `---` separator row is emitted only when `headers` is
    /// non-empty; data rows follow, one line each.
    ///
    /// Pipe characters inside headers and cells are escaped as `\|` so a
    /// cell containing `|` can no longer split into extra columns (the
    /// previous version emitted them verbatim, corrupting the table).
    /// Other cell content is emitted as-is.
    pub fn to_markdown(&self) -> String {
        // Escape `|` so cell content cannot break the column layout.
        fn escape_cell(cell: &str) -> String {
            cell.replace('|', "\\|")
        }
        // Render one `| a | b |` line from a list of cells.
        fn render_row(cells: &[String]) -> String {
            let escaped: Vec<String> = cells.iter().map(|c| escape_cell(c)).collect();
            format!("| {} |\n", escaped.join(" | "))
        }
        let mut md = String::new();
        if let Some(caption) = &self.caption {
            md.push_str(&format!("**{}**\n\n", caption));
        }
        if !self.headers.is_empty() {
            md.push_str(&render_row(&self.headers));
            // Separator row: one `---` per column.
            md.push_str(&format!("| {} |\n", vec!["---"; self.headers.len()].join(" | ")));
        }
        for row in &self.rows {
            md.push_str(&render_row(row));
        }
        md
    }
}
/// Extracts text and metadata from PDF files or in-memory PDF bytes.
pub struct PdfExtractor {
// Stored at construction; NOTE(review): not read by any method in this file.
config: PdfConfig,
}
impl PdfExtractor {
pub fn new(config: PdfConfig) -> Self {
Self { config }
}
pub fn extract_file(&self, path: &std::path::Path) -> Result<ExtractedDocument> {
let bytes = std::fs::read(path)?;
self.extract_bytes(&bytes)
}
pub fn extract_bytes(&self, bytes: &[u8]) -> Result<ExtractedDocument> {
let text = pdf_extract::extract_text_from_mem(bytes)?;
let doc = lopdf::Document::load_mem(bytes)?;
let metadata = self.extract_metadata(&doc);
let pages = self.split_into_pages(&text, metadata.page_count);
Ok(ExtractedDocument {
metadata,
pages,
full_text: text,
})
}
fn extract_metadata(&self, doc: &lopdf::Document) -> DocumentMetadata {
let mut metadata = DocumentMetadata::default();
metadata.page_count = doc.get_pages().len();
if let Ok(info) = doc.trailer.get(b"Info") {
if let Ok(info_ref) = info.as_reference() {
if let Ok(info_dict) = doc.get_dictionary(info_ref) {
metadata.title = self.get_string(info_dict, b"Title");
metadata.author = self.get_string(info_dict, b"Author");
metadata.subject = self.get_string(info_dict, b"Subject");
metadata.creator = self.get_string(info_dict, b"Creator");
metadata.producer = self.get_string(info_dict, b"Producer");
}
}
}
metadata
}
fn get_string(&self, dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
dict.get(key)
.ok()
.and_then(|v| v.as_str().ok())
.and_then(|s| String::from_utf8(s.to_vec()).ok())
}
fn split_into_pages(&self, text: &str, page_count: usize) -> Vec<ExtractedPage> {
let chunks: Vec<&str> = if text.contains('\x0C') {
text.split('\x0C').collect()
} else if page_count > 0 {
let chars_per_page = text.len() / page_count;
text.as_bytes()
.chunks(chars_per_page)
.map(|c| std::str::from_utf8(c).unwrap_or(""))
.collect()
} else {
vec![text]
};
chunks
.into_iter()
.enumerate()
.map(|(i, content)| ExtractedPage {
page_number: i + 1,
text: content.to_string(),
sections: Vec::new(), tables: Vec::new(), })
.collect()
}
}
impl Default for PdfExtractor {
fn default() -> Self {
Self::new(PdfConfig::default())
}
}
/// Splits a document's full text into overlapping chunks for RAG indexing.
///
/// Each chunk is at most `chunk_size` bytes (slightly more only when the
/// limit falls inside a multi-byte character) and, when possible, ends just
/// after the last sentence terminator (`.`, `!`, `?`) inside the window.
/// Consecutive chunks overlap by up to `overlap` bytes.
///
/// Fixes three defects in the previous version: byte slicing that panicked
/// on non-ASCII text when a boundary split a UTF-8 character; an infinite
/// loop when `chunk_size == 0` (now returns no chunks); and an infinite
/// loop when `overlap` was at least as large as the forward progress of a
/// chunk (the loop is now guaranteed to advance).
///
/// NOTE(review): `page_start`/`page_end` are hard-coded to 1; mapping byte
/// offsets back to `doc.pages` is still unimplemented — confirm consumers
/// do not rely on these fields.
pub fn chunk_for_rag(
    doc: &ExtractedDocument,
    chunk_size: usize,
    overlap: usize,
) -> Vec<DocumentChunk> {
    let text = &doc.full_text;
    let mut chunks = Vec::new();
    // A zero chunk size can never make progress; previously this looped forever.
    if chunk_size == 0 {
        return chunks;
    }
    let mut start = 0;
    while start < text.len() {
        let mut end = (start + chunk_size).min(text.len());
        // Never cut a UTF-8 sequence: push the end to the next char boundary.
        while end < text.len() && !text.is_char_boundary(end) {
            end += 1;
        }
        // Prefer to break just after the last sentence terminator in the
        // window; `+ 1` is safe because all three terminators are 1 byte.
        let actual_end = if end < text.len() {
            text[start..end]
                .rfind(|c: char| matches!(c, '.' | '!' | '?'))
                .map(|i| start + i + 1)
                .unwrap_or(end)
        } else {
            end
        };
        chunks.push(DocumentChunk {
            content: text[start..actual_end].to_string(),
            page_start: 1,
            page_end: 1,
            chunk_index: chunks.len(),
            metadata: doc.metadata.clone(),
        });
        // Step back by `overlap` for the next chunk, but force forward
        // progress (otherwise overlap >= chunk length loops forever) and
        // land on a char boundary.
        let mut next = if actual_end > overlap {
            actual_end - overlap
        } else {
            actual_end
        };
        if next <= start {
            next = actual_end;
        }
        while next < text.len() && !text.is_char_boundary(next) {
            next += 1;
        }
        start = next;
    }
    chunks
}
/// One chunk of document text produced by `chunk_for_rag`, carrying a copy
/// of the document metadata for standalone indexing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
/// The chunk's text.
pub content: String,
/// First page the chunk covers (NOTE(review): currently always 1).
pub page_start: usize,
/// Last page the chunk covers (NOTE(review): currently always 1).
pub page_end: usize,
/// 0-based position of this chunk within the document.
pub chunk_index: usize,
/// Copy of the source document's metadata.
pub metadata: DocumentMetadata,
}
#[cfg(test)]
mod tests {
use super::*;
// Defaults should enable layout preservation and metadata extraction.
#[test]
fn test_config_default() {
let config = PdfConfig::default();
assert!(config.preserve_layout);
assert!(config.extract_metadata);
}
// A captioned, headered table should render its caption, header cells,
// and a pipe-delimited data row.
#[test]
fn test_table_to_markdown() {
let table = Table {
caption: Some("Test Table".to_string()),
headers: vec!["Col1".to_string(), "Col2".to_string()],
rows: vec![vec!["A".to_string(), "B".to_string()]],
};
let md = table.to_markdown();
assert!(md.contains("Test Table"));
assert!(md.contains("Col1"));
assert!(md.contains("| A | B |"));
}
}