use std::path::Path;
use lopdf::Document as LopdfDocument;
use tracing::{info, warn};
use crate::Error;
use crate::error::Result;
use crate::index::parse::toc::TocProcessor;
use crate::llm::LlmClient;
use super::types::{PdfMetadata, PdfPage, PdfParseResult};
use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
/// PDF parser that extracts page text and metadata and converts them into
/// raw index nodes.
///
/// Construct it with [`PdfParser::new`], [`PdfParser::with_config`],
/// [`PdfParser::with_llm_client`] or [`PdfParser::without_toc`].
pub struct PdfParser {
/// Page limit and TOC switch consulted while parsing.
config: PdfParserConfig,
/// Optional LLM client; when present it is cloned into the `TocProcessor`
/// created for table-of-contents extraction.
llm_client: Option<LlmClient>,
}
/// Options controlling how [`PdfParser`] processes a document.
#[derive(Debug, Clone)]
pub struct PdfParserConfig {
    /// Maximum number of pages whose text is extracted; `0` means no limit.
    pub max_pages: usize,
    /// Whether table-of-contents extraction is attempted before falling back
    /// to one node per page.
    pub extract_toc: bool,
}

impl Default for PdfParserConfig {
    /// Returns the default options: no page limit and TOC extraction enabled.
    fn default() -> Self {
        PdfParserConfig {
            extract_toc: true,
            max_pages: 0,
        }
    }
}
impl PdfParser {
pub fn new() -> Self {
Self::default()
}
/// Creates a parser with the default configuration that owns `client`.
///
/// The client is later cloned into the `TocProcessor` when TOC extraction runs.
pub fn with_llm_client(client: LlmClient) -> Self {
    let config = PdfParserConfig::default();
    let llm_client = Some(client);
    Self { config, llm_client }
}
/// Creates a parser that uses the given `config` and has no LLM client.
pub fn with_config(config: PdfParserConfig) -> Self {
    let llm_client = None;
    Self { config, llm_client }
}
pub fn without_toc() -> Self {
Self {
config: PdfParserConfig {
extract_toc: false,
..Default::default()
},
llm_client: None,
}
}
/// Extracts page text and metadata from in-memory PDF bytes.
///
/// `filename` is used as the fallback document title; when it is `None`
/// the title falls back to `"Document"`.
///
/// Metadata is best-effort: if `lopdf` cannot load the bytes, the error is
/// discarded and the metadata carries only the fallback title and the
/// number of extracted pages.
///
/// # Errors
///
/// Returns the error from page-text extraction if that step fails.
pub async fn parse_bytes_raw(
    &self,
    bytes: &[u8],
    filename: Option<&str>,
) -> Result<PdfParseResult> {
    let pages = self.extract_pages(bytes)?;

    let metadata = LopdfDocument::load_mem(bytes)
        .map(|doc| self.extract_metadata(&doc, filename))
        .unwrap_or_else(|_| PdfMetadata {
            title: filename.unwrap_or("Document").to_string(),
            page_count: pages.len(),
            ..Default::default()
        });

    Ok(PdfParseResult::new(metadata, pages))
}
/// Extracts the text of each page using `pdf-extract`.
///
/// Pages are numbered from 1 in document order. Pages whose text is empty
/// or whitespace-only are skipped but still consume their page number.
/// When `config.max_pages` is non-zero, only the first `max_pages` pages of
/// the document are considered.
///
/// # Errors
///
/// Returns `Error::Parse` when `pdf-extract` fails on the input.
fn extract_pages(&self, bytes: &[u8]) -> Result<Vec<PdfPage>> {
    let page_texts = pdf_extract::extract_text_from_mem_by_pages(bytes)
        .map_err(|e| Error::Parse(format!("pdf-extract failed: {}", e)))?;

    // A limit of zero means "no limit".
    let limit = match self.config.max_pages {
        0 => usize::MAX,
        n => n,
    };

    let pages: Vec<PdfPage> = page_texts
        .into_iter()
        .enumerate()
        .take(limit)
        .filter(|(_, text)| !text.trim().is_empty())
        .map(|(index, text)| PdfPage::new(index + 1, text))
        .collect();

    Ok(pages)
}
/// Builds document metadata from a loaded `lopdf` document.
///
/// The title starts as `filename` (or `"Document"` when absent) and the
/// page count comes from `doc.get_pages()`. If the trailer's `Info` entry
/// is a reference to a dictionary, its `Title`, `Author` and `Subject`
/// string entries override or fill the corresponding fields after being
/// passed through `decode_pdf_string`. Any lookup that fails is skipped.
fn extract_metadata(&self, doc: &LopdfDocument, filename: Option<&str>) -> PdfMetadata {
    let mut metadata = PdfMetadata {
        title: filename.unwrap_or("Document").to_string(),
        page_count: doc.get_pages().len(),
        ..Default::default()
    };

    // Trailer -> Info reference -> referenced object -> dictionary.
    let info_dict = doc
        .trailer
        .get(b"Info")
        .ok()
        .and_then(|info| info.as_reference().ok())
        .and_then(|info_ref| doc.get_object(info_ref).ok())
        .and_then(|info_obj| info_obj.as_dict().ok());

    if let Some(dict) = info_dict {
        // Reads one string entry from the Info dictionary and decodes it.
        let read_text = |key: &[u8]| {
            dict.get(key)
                .ok()
                .and_then(|value| value.as_str().ok())
                .map(|raw| self.decode_pdf_string(raw))
        };

        if let Some(title) = read_text(b"Title") {
            metadata.title = title;
        }
        if let Some(author) = read_text(b"Author") {
            metadata.author = Some(author);
        }
        if let Some(subject) = read_text(b"Subject") {
            metadata.subject = Some(subject);
        }
    }

    metadata
}
/// Decodes the raw bytes of a PDF string into a `String`.
///
/// Three encodings are recognised, checked in this order:
///
/// 1. A leading `FE FF` byte-order mark: the remainder is decoded as
///    UTF-16BE. Invalid sequences become U+FFFD and a dangling odd byte is
///    ignored.
/// 2. A leading `EF BB BF` byte-order mark: the remainder is decoded as
///    UTF-8, with invalid sequences replaced by U+FFFD.
/// 3. Anything else is processed byte by byte: backslash escapes are
///    resolved and only printable ASCII (`0x20..=0x7E`) is kept.
///
/// Control characters are removed in the two Unicode paths, matching the
/// printable-only filter of the byte-wise path.
///
/// Escape handling in the byte-wise path:
/// - `\n`, `\r`, `\t`, `\(`, `\)` and `\\` produce the character they name;
/// - `\b` and `\f` name control characters and produce nothing;
/// - `\ddd` (one to three octal digits) produces that byte, which is then
///   subject to the printable-ASCII filter;
/// - for any other escaped byte the backslash is ignored and the byte is
///   treated like an unescaped one;
/// - a backslash that is the final byte is kept literally.
fn decode_pdf_string(&self, bytes: &[u8]) -> String {
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    // Appends `byte` only when it is printable ASCII.
    fn push_printable(out: &mut String, byte: u8) {
        if (0x20..0x7F).contains(&byte) {
            out.push(char::from(byte));
        }
    }

    if let Some(payload) = bytes.strip_prefix(UTF16_BE_BOM) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units)
            .chars()
            .filter(|c| !c.is_control())
            .collect();
    }

    if let Some(payload) = bytes.strip_prefix(UTF8_BOM) {
        return String::from_utf8_lossy(payload)
            .chars()
            .filter(|c| !c.is_control())
            .collect();
    }

    let mut result = String::with_capacity(bytes.len());
    let mut remaining = bytes.iter().copied().peekable();

    while let Some(byte) = remaining.next() {
        if byte != b'\\' {
            push_printable(&mut result, byte);
            continue;
        }

        match remaining.next() {
            // Lone trailing backslash: nothing to escape, keep it.
            None => result.push('\\'),
            Some(b'n') => result.push('\n'),
            Some(b'r') => result.push('\r'),
            Some(b't') => result.push('\t'),
            Some(b'(') => result.push('('),
            Some(b')') => result.push(')'),
            Some(b'\\') => result.push('\\'),
            // Backspace / form feed are control characters; drop them.
            Some(b'b') | Some(b'f') => {}
            Some(first @ b'0'..=b'7') => {
                // Octal escape: this digit plus up to two more.
                let mut value = u32::from(first - b'0');
                for _ in 0..2 {
                    match remaining.peek() {
                        Some(&digit) if (b'0'..=b'7').contains(&digit) => {
                            value = value * 8 + u32::from(digit - b'0');
                            remaining.next();
                        }
                        _ => break,
                    }
                }
                // Three octal digits can reach 0o777; only the low byte is
                // meaningful, so the truncation here is intentional.
                push_printable(&mut result, (value & 0xFF) as u8);
            }
            // Unknown escape: ignore the backslash, keep the byte itself.
            Some(other) => push_printable(&mut result, other),
        }
    }

    result
}
/// Converts TOC entries into raw nodes, one node per entry in input order.
///
/// Each node takes the entry's title and level, the content returned by
/// `get_content_for_entry`, and the entry's physical page when one is set.
fn toc_entries_to_raw_nodes(
    &self,
    entries: &[crate::index::parse::toc::TocEntry],
    pages: &[PdfPage],
) -> Vec<RawNode> {
    entries
        .iter()
        .map(|entry| {
            let body = self.get_content_for_entry(entry, pages);
            let node = RawNode::new(&entry.title)
                .with_content(body)
                .with_level(entry.level);
            match entry.physical_page {
                Some(page) => node.with_page(page),
                None => node,
            }
        })
        .collect()
}
/// Returns the text associated with a TOC entry.
///
/// The entry's physical page is looked up by page number (page 1 when the
/// entry has no physical page). If that page is not among `pages`, an empty
/// string is returned. Otherwise the result is the page text following the
/// first occurrence of the entry title, trimmed; when the title does not
/// occur on the page, the whole page text is returned unchanged.
fn get_content_for_entry(
    &self,
    entry: &crate::index::parse::toc::TocEntry,
    pages: &[PdfPage],
) -> String {
    let start_page = entry.physical_page.unwrap_or(1);

    match pages.iter().find(|candidate| candidate.number == start_page) {
        None => String::new(),
        Some(page) => match page.text.find(&entry.title) {
            Some(title_start) => {
                let after_title = title_start + entry.title.len();
                page.text[after_title..].trim().to_string()
            }
            None => page.text.clone(),
        },
    }
}
/// Builds one level-1 raw node per page, titled `Page <number>`, holding
/// the page's full text and page number.
fn pages_to_raw_nodes(&self, pages: &[PdfPage]) -> Vec<RawNode> {
    let mut nodes = Vec::with_capacity(pages.len());
    for page in pages {
        let heading = format!("Page {}", page.number);
        let node = RawNode::new(heading)
            .with_content(page.text.clone())
            .with_level(1)
            .with_page(page.number);
        nodes.push(node);
    }
    nodes
}
}
impl Default for PdfParser {
    /// Returns a parser with the default configuration and no LLM client.
    fn default() -> Self {
        Self {
            config: PdfParserConfig::default(),
            llm_client: None,
        }
    }
}
impl PdfParser {
pub async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?;
let filename = path.file_stem().and_then(|s| s.to_str());
self.parse_bytes_to_result(&bytes, filename, Some(path))
.await
}
/// Parses in-memory PDF bytes into a `ParseResult` with no source path.
///
/// `filename` is forwarded as the fallback document title.
pub async fn parse_bytes_async(
    &self,
    bytes: &[u8],
    filename: Option<&str>,
) -> Result<ParseResult> {
    let no_source_path = None;
    self.parse_bytes_to_result(bytes, filename, no_source_path)
        .await
}
/// Shared implementation behind `parse_file` and `parse_bytes_async`.
///
/// Extracts pages and metadata, then builds the node list:
/// - with `config.extract_toc` disabled, one node per page;
/// - otherwise a `TocProcessor` (given a clone of the LLM client when one
///   is configured) processes the pages; a non-empty result becomes
///   TOC-based nodes, while an empty result or an error falls back to one
///   node per page after logging a warning.
///
/// The resulting `DocumentMeta` uses the metadata title as name, the
/// metadata subject as description, and `source_path` rendered lossily.
///
/// # Errors
///
/// Propagates any error from `parse_bytes_raw`; TOC failures are not errors.
async fn parse_bytes_to_result(
    &self,
    bytes: &[u8],
    filename: Option<&str>,
    source_path: Option<&Path>,
) -> Result<ParseResult> {
    let parsed = self.parse_bytes_raw(bytes, filename).await?;
    // NOTE(review): this is the number of extracted pages (blank pages are
    // skipped and `max_pages` applies), not `metadata.page_count` — confirm
    // that this is the intended value for `DocumentMeta::page_count`.
    let page_count = parsed.pages.len();

    let nodes = if !self.config.extract_toc {
        self.pages_to_raw_nodes(&parsed.pages)
    } else {
        info!("Extracting TOC from PDF with {} pages", page_count);

        let processor = if let Some(client) = &self.llm_client {
            info!("PdfParser: creating TocProcessor with LLM client");
            TocProcessor::with_llm_client(client.clone())
        } else {
            info!(
                "PdfParser: creating TocProcessor without LLM client (no key configured)"
            );
            TocProcessor::new()
        };

        // `None` means "use the page-based fallback".
        let toc_nodes = match processor.process(&parsed.pages).await {
            Ok(entries) if entries.is_empty() => {
                warn!("No TOC entries found, falling back to page-based extraction");
                None
            }
            Ok(entries) => {
                info!("Extracted {} TOC entries", entries.len());
                Some(self.toc_entries_to_raw_nodes(&entries, &parsed.pages))
            }
            Err(e) => {
                warn!(
                    "TOC extraction failed: {}, falling back to page-based extraction",
                    e
                );
                None
            }
        };

        match toc_nodes {
            Some(nodes) => nodes,
            None => self.pages_to_raw_nodes(&parsed.pages),
        }
    };

    let source = source_path.map(|p| p.to_string_lossy().to_string());
    let meta = DocumentMeta {
        name: parsed.metadata.title,
        format: DocumentFormat::Pdf,
        page_count: Some(page_count),
        line_count: 0,
        source_path: source,
        description: parsed.metadata.subject,
    };

    Ok(ParseResult::new(meta, nodes))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created parser has no page limit and TOC extraction on.
    #[test]
    fn test_parser_creation() {
        let parser = PdfParser::new();
        let config = &parser.config;
        assert_eq!(config.max_pages, 0);
        assert!(config.extract_toc);
    }

    /// `without_toc` turns TOC extraction off.
    #[test]
    fn test_parser_without_toc() {
        let toc_enabled = PdfParser::without_toc().config.extract_toc;
        assert!(!toc_enabled);
    }

    /// Plain ASCII passes through and `\n` escapes become newlines.
    #[test]
    fn test_decode_pdf_string() {
        let parser = PdfParser::new();
        let cases: [(&[u8], &str); 2] = [
            (&b"Hello World"[..], "Hello World"),
            (&b"Hello\\nWorld"[..], "Hello\nWorld"),
        ];
        for &(raw, expected) in cases.iter() {
            assert_eq!(parser.decode_pdf_string(raw), expected);
        }
    }
}