#[cfg(feature = "content-processing")]
use crate::content_processing::{
ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
ExtractedImage, ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats,
TocEntry,
};
#[cfg(feature = "content-processing")]
use anyhow::{anyhow, Result};
#[cfg(feature = "content-processing")]
use std::collections::HashMap;
/// Format handler that extracts text, metadata, headings, tables, and links
/// from PDF documents. Text extraction is delegated to the `pdf-extract`
/// crate; structure is recovered heuristically from the extracted text.
#[cfg(feature = "content-processing")]
pub struct PdfHandler;
#[cfg(feature = "content-processing")]
impl FormatHandler for PdfHandler {
    /// Extracts text and document structure from a PDF byte buffer.
    ///
    /// Text extraction is delegated to `pdf_extract`; headings, tables,
    /// links, and Info-dictionary metadata are then recovered heuristically
    /// from the extracted plain text / raw bytes. Table, link, and image
    /// extraction are gated on the corresponding `config` flags.
    ///
    /// # Errors
    /// Returns an error when `pdf_extract` fails, or when the document
    /// yields no non-whitespace text at all.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let text = pdf_extract::extract_text_from_mem(data)
            .map_err(|e| anyhow!("Failed to extract text from PDF: {}", e))?;
        if text.trim().is_empty() {
            return Err(anyhow!("No text content found in PDF"));
        }

        let mut metadata = HashMap::new();
        metadata.insert("format".to_string(), "PDF".to_string());
        metadata.insert("size".to_string(), data.len().to_string());
        metadata.insert("extraction_method".to_string(), "pdf-extract".to_string());
        // Merge any Info-dictionary fields scraped from the raw bytes.
        if let Some(pdf_metadata) = self.extract_pdf_metadata(data) {
            metadata.extend(pdf_metadata);
        }

        // Form feed (0x0C) is the conventional page separator emitted by
        // text extractors. NOTE(review): this counts separators, not pages —
        // an N-page document may report N-1; confirm pdf-extract's convention.
        let estimated_pages = text.matches('\x0C').count().max(1);

        let headings = self.extract_pdf_headings(&text);
        let tables = if config.extract_tables {
            self.extract_pdf_tables(&text)
        } else {
            Vec::new()
        };
        let links = if config.extract_links {
            self.extract_pdf_links(&text)
        } else {
            Vec::new()
        };
        let toc = self.generate_table_of_contents(&headings);
        // Image extraction is currently a stub; fall back to empty on error.
        let images = if config.extract_images {
            self.extract_pdf_images(data, config).unwrap_or_default()
        } else {
            Vec::new()
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Pdf,
            text: text.trim().to_string(),
            metadata,
            images,
            tables,
            links,
            structure: DocumentStructure {
                title: self.extract_pdf_title(&text),
                headings: headings.clone(),
                page_count: estimated_pages,
                section_count: headings.len().max(1),
                table_of_contents: toc,
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns true when the buffer begins with the `%PDF` magic header.
    fn can_handle(&self, data: &[u8]) -> bool {
        data.starts_with(b"%PDF")
    }

    /// File extensions this handler is registered for.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["pdf"]
    }
}
#[cfg(feature = "content-processing")]
impl PdfHandler {
    /// Heuristic title detection: the first of the opening ten lines whose
    /// trimmed length is plausible for a title (6..=99 chars) and which does
    /// not look like a URL.
    fn extract_pdf_title(&self, text: &str) -> Option<String> {
        text.lines()
            .take(10)
            .map(str::trim)
            .find(|t| t.len() > 5 && t.len() < 100 && !t.contains("http") && !t.contains("www"))
            .map(str::to_string)
    }

    /// Heuristic heading detection: short lines (6..=79 chars, at most 10
    /// words) where at least half the words start with an uppercase letter
    /// are treated as level-1 headings. Line numbers are 1-based.
    fn extract_pdf_headings(&self, text: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        for (i, line) in text.lines().enumerate() {
            let trimmed = line.trim();
            if trimmed.len() <= 5 || trimmed.len() >= 80 {
                continue;
            }
            let words: Vec<&str> = trimmed.split_whitespace().collect();
            let capitalized_words = words
                .iter()
                .filter(|w| w.chars().next().is_some_and(|c| c.is_uppercase()))
                .count();
            if capitalized_words >= words.len() / 2 && words.len() <= 10 {
                headings.push(Heading {
                    level: 1,
                    text: trimmed.to_string(),
                    location: ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: Some(i + 1),
                        column: None,
                    },
                });
            }
        }
        headings
    }

    /// Builds an `ExtractedTable` from accumulated rows, treating the first
    /// row as the header row. Returns `None` when there are fewer than two
    /// rows (a lone row is not considered a table).
    fn build_table(rows: &[Vec<String>], line: usize) -> Option<ExtractedTable> {
        if rows.len() < 2 {
            return None;
        }
        Some(ExtractedTable {
            headers: rows[0].clone(),
            rows: rows[1..].to_vec(),
            caption: None,
            location: ContentLocation {
                page: None,
                section: None,
                char_offset: None,
                line: Some(line),
                column: None,
            },
        })
    }

    /// Heuristic table detection on extracted plain text.
    ///
    /// A line is considered tabular when it has 2..=8 whitespace-separated
    /// cells and either contains a numeric cell, a tab, or at least two
    /// space characters. Consecutive tabular lines are accumulated; any
    /// non-tabular line terminates the current candidate (single-row
    /// candidates are discarded rather than carried across the gap).
    fn extract_pdf_tables(&self, text: &str) -> Vec<ExtractedTable> {
        let mut tables = Vec::new();
        let lines: Vec<&str> = text.lines().collect();
        let mut current_table: Vec<Vec<String>> = Vec::new();
        let mut in_table = false;

        for (i, line) in lines.iter().enumerate() {
            let trimmed = line.trim();
            let parts: Vec<&str> = trimmed.split_whitespace().collect();
            let plausible_width = (2..=8).contains(&parts.len());
            let looks_tabular = plausible_width
                && (parts.iter().any(|p| p.parse::<f64>().is_ok())
                    || trimmed.contains('\t')
                    || trimmed.matches(' ').count() >= 2);

            if looks_tabular {
                if !in_table {
                    in_table = true;
                    current_table.clear();
                }
                current_table.push(parts.iter().map(|s| s.to_string()).collect());
            } else if in_table {
                // Any non-tabular line ends the candidate table. This also
                // discards single-row candidates, which previously leaked
                // into the next table when the interrupting line happened
                // to have 2..=8 words.
                if let Some(table) = Self::build_table(&current_table, i + 1) {
                    tables.push(table);
                }
                in_table = false;
                current_table.clear();
            }
        }

        // Flush a table still open at end-of-text.
        if in_table {
            if let Some(table) = Self::build_table(&current_table, lines.len()) {
                tables.push(table);
            }
        }
        tables
    }

    /// Scans the extracted text for `http(s)` URLs and email addresses,
    /// returning them as links (emails become `mailto:` URLs). Trailing
    /// punctuation that commonly follows a URL in prose is stripped.
    fn extract_pdf_links(&self, text: &str) -> Vec<ExtractedLink> {
        let mut links = Vec::new();
        let url_regex =
            regex::Regex::new(r"https?://[^\s\)]+").expect("URL regex pattern is valid");
        let email_regex = regex::Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
            .expect("email regex pattern is valid");

        for mat in url_regex.find_iter(text) {
            let url = mat
                .as_str()
                .trim_end_matches(&['.', ',', ')', ']', '}'][..]);
            links.push(ExtractedLink {
                url: url.to_string(),
                text: url.to_string(),
                title: None,
                location: ContentLocation {
                    page: None,
                    section: None,
                    char_offset: None,
                    line: None,
                    column: None,
                },
            });
        }
        for mat in email_regex.find_iter(text) {
            let email = mat.as_str();
            links.push(ExtractedLink {
                url: format!("mailto:{}", email),
                text: email.to_string(),
                title: None,
                location: ContentLocation {
                    page: None,
                    section: None,
                    char_offset: None,
                    line: None,
                    column: None,
                },
            });
        }
        links
    }

    /// Scrapes common PDF Info-dictionary entries (`/Title (…)` etc.) from
    /// the raw byte stream. This only finds entries stored uncompressed.
    /// Returns `None` when no entry is found.
    fn extract_pdf_metadata(&self, data: &[u8]) -> Option<HashMap<String, String>> {
        // (regex pattern, output key) pairs for the Info-dictionary fields
        // we look for; the first capture group is the field value.
        const FIELDS: [(&str, &str); 4] = [
            (r"/Title\s*\(\s*([^)]+)\s*\)", "title"),
            (r"/Author\s*\(\s*([^)]+)\s*\)", "author"),
            (r"/Subject\s*\(\s*([^)]+)\s*\)", "subject"),
            (r"/CreationDate\s*\(\s*([^)]+)\s*\)", "creation_date"),
        ];

        let content = String::from_utf8_lossy(data);
        let mut metadata = HashMap::new();
        for (pattern, key) in FIELDS {
            let re = regex::Regex::new(pattern).expect("metadata regex pattern is valid");
            if let Some(value) = re.captures(&content).and_then(|c| c.get(1)) {
                metadata.insert(key.to_string(), value.as_str().to_string());
            }
        }
        (!metadata.is_empty()).then_some(metadata)
    }

    /// Placeholder: image extraction from PDFs is not implemented. Always
    /// returns an empty list (the original had two identical branches on
    /// `config.extract_images`, both returning empty).
    fn extract_pdf_images(
        &self,
        _data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<Vec<ExtractedImage>> {
        Ok(Vec::new())
    }

    /// Maps every detected heading 1:1 into a table-of-contents entry,
    /// preserving level and location.
    fn generate_table_of_contents(&self, headings: &[Heading]) -> Vec<TocEntry> {
        headings
            .iter()
            .map(|heading| TocEntry {
                title: heading.text.clone(),
                level: heading.level,
                page: heading.location.page,
                location: heading.location.clone(),
            })
            .collect()
    }
}