use super::FileReader;
use anyhow::{Context, Result};
use std::io::Read;
#[cfg(feature = "pdf")]
extern crate pdf_extract;
/// Stateless reader that extracts plain text from PDF documents.
///
/// The type carries no data, so it is trivially copyable and can be created
/// with [`Default::default`] as well as through its constructor.
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfReader;
impl PdfReader {
    /// Creates a new PDF reader. The reader holds no state.
    pub fn new() -> Self {
        Self
    }

    /// Extracts the text of a PDF document held in memory.
    ///
    /// With the `pdf` feature enabled the bytes are handed to
    /// `pdf_extract::extract_text_from_mem`:
    ///
    /// * If the extracted text is empty after trimming whitespace, the
    ///   placeholder `"[image page]"` is returned.
    /// * Otherwise the extracted text is returned verbatim (untrimmed).
    /// * If extraction fails and the lower-cased error message mentions
    ///   `"no text"`, `"image"` or `"scan"`, the placeholder
    ///   `"[image page]"` is returned instead of an error.
    ///
    /// # Errors
    ///
    /// * Extraction failed and the lower-cased error message mentions
    ///   `"encrypt"`, `"password"` or `"security"` (reported as an
    ///   encrypted-PDF error).
    /// * Extraction failed for any other reason not covered above.
    /// * The crate was built without the `pdf` feature.
    fn extract_text_from_bytes_internal(&self, bytes: &[u8]) -> Result<String> {
        #[cfg(feature = "pdf")]
        {
            match pdf_extract::extract_text_from_mem(bytes) {
                // Nothing but whitespace came out: report a placeholder instead.
                Ok(text) if text.trim().is_empty() => Ok("[image page]".to_string()),
                // Returned as-is, so any U+000C form feeds in the text are kept.
                // (Replacing a form feed with itself would only copy the string.)
                Ok(text) => Ok(text),
                Err(e) => {
                    // The failure is classified by keywords in its message.
                    let error_msg = e.to_string().to_lowercase();
                    if error_msg.contains("encrypt")
                        || error_msg.contains("password")
                        || error_msg.contains("security")
                    {
                        Err(anyhow::anyhow!(
                            "PDF appears to be encrypted and requires a password for text extraction. \
                             Error: {}", e
                        ))
                    } else if error_msg.contains("no text")
                        || error_msg.contains("image")
                        || error_msg.contains("scan")
                    {
                        Ok("[image page]".to_string())
                    } else {
                        Err(anyhow::anyhow!("Failed to extract text from PDF: {}", e))
                    }
                }
            }
        }
        #[cfg(not(feature = "pdf"))]
        {
            // Keeps `bytes` "used" when PDF support is compiled out.
            let _ = bytes;
            Err(anyhow::anyhow!(
                "PDF support is not enabled. Please compile with the 'pdf' feature flag to enable PDF processing."
            ))
        }
    }
}
impl FileReader for PdfReader {
    /// Reads the file at `file_path` fully into memory and extracts its text.
    ///
    /// # Errors
    ///
    /// Fails if the file cannot be read or if text extraction fails; both
    /// errors are annotated with the file path.
    fn read_as_text(&self, file_path: &str) -> Result<String> {
        let bytes = std::fs::read(file_path)
            .with_context(|| format!("Failed to read PDF file: {}", file_path))?;
        self.read_as_text_from_bytes(&bytes)
            .with_context(|| format!("Failed to extract text from PDF file: {}", file_path))
    }

    /// Extracts text from a PDF document already held in memory.
    ///
    /// # Errors
    ///
    /// Propagates any error from the underlying extraction.
    fn read_as_text_from_bytes(&self, bytes: &[u8]) -> Result<String> {
        self.extract_text_from_bytes_internal(bytes)
    }

    /// Drains `reader` to its end and extracts text from the collected bytes.
    ///
    /// # Errors
    ///
    /// Fails if reading from `reader` fails or if text extraction fails.
    fn read_as_text_from_reader(&self, mut reader: Box<dyn Read>) -> Result<String> {
        let mut bytes = Vec::new();
        reader
            .read_to_end(&mut bytes)
            .context("Failed to read bytes from reader")?;
        self.read_as_text_from_bytes(&bytes)
    }

    /// Returns `true` when `extension` is `pdf`, compared case-insensitively.
    fn can_handle(&self, extension: &str) -> bool {
        // Only `P`, `D`, `F` lowercase to the letters of "pdf", so an ASCII
        // case-insensitive compare is sufficient and avoids allocating.
        extension.eq_ignore_ascii_case("pdf")
    }
}