use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
use chrono::Utc;
use lopdf::Document as PdfDocument;
use std::path::Path;
use tracing::{debug, info, warn};
/// Ingester that turns a PDF file into a `Document`.
///
/// Derives `Debug` and `Clone` so the ingester can be logged and duplicated
/// like any other plain configuration value.
#[derive(Debug, Clone)]
pub struct PdfIngester {
    /// When `true`, `ingest` reads document metadata from the PDF; when
    /// `false`, it uses `Metadata::default()` instead.
    extract_metadata: bool,
}
impl PdfIngester {
/// Creates an ingester with metadata extraction switched on.
pub fn new() -> Self {
    let extract_metadata = true;
    Self { extract_metadata }
}
/// Loads the PDF at `path` and converts it into a [`Document`].
///
/// Text is extracted page by page and joined with newlines before being
/// normalised by `clean_text`. A page whose extraction fails is logged with
/// `warn!` and skipped, so one bad page does not abort the whole ingest.
/// The resulting document is always created as `DocumentType::Paper`.
///
/// # Errors
///
/// Returns `Error::pdf` when `lopdf` cannot load the file.
pub fn ingest(&self, path: &Path) -> Result<Document> {
    info!("Ingesting PDF: {:?}", path);
    let pdf_doc = PdfDocument::load(path)
        .map_err(|e| Error::pdf(format!("Failed to load PDF: {}", e)))?;

    // Build the page table once and reuse it for both the count and the
    // loop instead of asking lopdf to rebuild it twice.
    let pages = pdf_doc.get_pages();
    let page_count = pages.len();
    debug!("PDF has {} pages", page_count);

    let mut full_text = String::new();
    for &page_num in pages.keys() {
        match self.extract_page_text(&pdf_doc, page_num) {
            Ok(text) if !text.is_empty() => {
                full_text.push_str(&text);
                full_text.push('\n');
            }
            Ok(_) => {}
            Err(e) => {
                // Best effort: keep whatever the other pages yield.
                warn!("Failed to extract text from page {}: {}", page_num, e);
            }
        }
    }
    let cleaned_text = self.clean_text(&full_text);

    let metadata = if self.extract_metadata {
        self.extract_metadata(&pdf_doc, path)
    } else {
        Metadata::default()
    };

    let source = Source {
        source_type: self.detect_source_type(path),
        url: None,
        path: Some(path.to_string_lossy().into_owned()),
        arxiv_id: self.extract_arxiv_id(path),
        github_repo: None,
        retrieved_at: Utc::now(),
        version: None,
    };

    let mut doc = Document::new(DocumentType::Paper, source).with_content(cleaned_text);
    doc.metadata = metadata;
    info!(
        "Extracted {} chars from {} pages",
        doc.content.char_count, page_count
    );
    Ok(doc)
}
/// Extracts the text of a single page.
///
/// `page_num` is 1-based: it is converted to a 0-based position in
/// `doc.page_iter()`.
///
/// # Errors
///
/// Returns `Error::pdf` when `page_num` is `0` or past the last page, or when
/// the page's content stream cannot be read.
fn extract_page_text(&self, doc: &PdfDocument, page_num: u32) -> Result<String> {
    let page_id = page_num
        // `page_num - 1` would underflow for 0; treat that as "not found".
        .checked_sub(1)
        // u32 -> usize is a widening conversion on 32- and 64-bit targets.
        .and_then(|index| doc.page_iter().nth(index as usize))
        .ok_or_else(|| Error::pdf(format!("Page {} not found", page_num)))?;
    let content = doc
        .get_page_content(page_id)
        .map_err(|e| Error::pdf(format!("Failed to get page content: {}", e)))?;
    Ok(self.parse_content_stream(&content, doc))
}
/// Pulls text out of a raw page content stream.
///
/// The stream is decoded lossily as UTF-8 and scanned line by line. Only
/// lines between a `BT` line and the following `ET` line are passed to
/// `extract_text_from_operator`. The text gathered from each `BT`/`ET` block
/// is appended to the result followed by a single space.
///
/// NOTE: `BT`/`ET` are recognised only when they are alone on a (trimmed)
/// line, and text from a block that is never closed by `ET` is discarded.
/// `_doc` is currently unused.
fn parse_content_stream(&self, content: &[u8], _doc: &PdfDocument) -> String {
    let content_str = String::from_utf8_lossy(content);
    let mut text = String::new();
    let mut current_text = String::new();
    let mut in_text = false;

    for line in content_str.lines().map(str::trim) {
        match line {
            "BT" => in_text = true,
            "ET" => {
                // Flush the finished text block, separated by one space.
                if !current_text.is_empty() {
                    text.push_str(&current_text);
                    text.push(' ');
                    current_text.clear();
                }
                in_text = false;
            }
            _ if in_text => {
                if let Some(fragment) = self.extract_text_from_operator(line) {
                    current_text.push_str(&fragment);
                }
            }
            _ => {}
        }
    }
    text
}
fn extract_text_from_operator(&self, line: &str) -> Option<String> {
let line = line.trim();
if line.ends_with("Tj") {
if let Some(start) = line.find('(') {
if let Some(end) = line.rfind(')') {
let text = &line[start + 1..end];
return Some(self.decode_pdf_string(text));
}
}
}
if line.ends_with("TJ") {
let mut result = String::new();
let mut in_string = false;
let mut current = String::new();
for c in line.chars() {
match c {
'(' => {
in_string = true;
current.clear();
}
')' => {
if in_string {
result.push_str(&self.decode_pdf_string(¤t));
in_string = false;
}
}
_ if in_string => {
current.push(c);
}
_ => {}
}
}
if !result.is_empty() {
return Some(result);
}
}
None
}
/// Resolves backslash escapes in the body of a PDF literal string.
///
/// Handled escapes: `\n`, `\r`, `\t`, `\b`, `\f`, `\\`, `\(`, `\)` and octal
/// codes of one to three digits (`\ddd`, digits `0`–`7`). An octal value
/// above `0o377` keeps only its low eight bits. The resulting byte is mapped
/// to the `char` with the same code point. For any other escaped character
/// the backslash is dropped and the character kept; a trailing lone
/// backslash is dropped.
fn decode_pdf_string(&self, s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        match chars.next() {
            Some('n') => result.push('\n'),
            Some('r') => result.push('\r'),
            Some('t') => result.push('\t'),
            Some('b') => result.push('\u{0008}'),
            Some('f') => result.push('\u{000C}'),
            Some('\\') => result.push('\\'),
            Some('(') => result.push('('),
            Some(')') => result.push(')'),
            Some(first) if first.is_digit(8) => {
                // Only 0-7 are octal digits; 8 and 9 fall through to the
                // generic arm below instead of being silently swallowed.
                let mut code = first.to_digit(8).unwrap_or(0);
                let mut taken = 1;
                while taken < 3 {
                    match chars.peek().and_then(|next| next.to_digit(8)) {
                        Some(value) => {
                            code = code * 8 + value;
                            chars.next();
                            taken += 1;
                        }
                        None => break,
                    }
                }
                // Three octal digits reach at most 0o777; masking keeps the
                // low byte, so the narrowing cast cannot lose anything else.
                result.push(char::from((code & 0xFF) as u8));
            }
            Some(other) => result.push(other),
            None => {}
        }
    }
    result
}
/// Normalises extracted text.
///
/// * NUL (`U+0000`) and the byte-order mark (`U+FEFF`) are removed.
/// * Every run of whitespace becomes a single ASCII space.
/// * Leading and trailing whitespace is dropped.
///
/// NUL/BOM characters are discarded during the scan, so whitespace on both
/// sides of one still collapses into a single space.
fn clean_text(&self, text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // A separator is owed before the next visible character. It is never set
    // before the first visible character and never emitted at the end, which
    // trims both ends without a second pass.
    let mut pending_space = false;

    for c in text.chars() {
        match c {
            '\u{0000}' | '\u{FEFF}' => {}
            c if c.is_whitespace() => pending_space = !cleaned.is_empty(),
            c => {
                if pending_space {
                    cleaned.push(' ');
                    pending_space = false;
                }
                cleaned.push(c);
            }
        }
    }
    cleaned
}
/// Builds a [`Metadata`] from the PDF's Info dictionary.
///
/// Reads the trailer's `Info` entry (when it is an indirect reference to a
/// dictionary) and maps:
///
/// * `Title` → `title` (ignored when blank),
/// * `Author` → a single entry in `authors`,
/// * `Subject` → `abstract_text`,
/// * `Keywords` → `tags`, split on commas, trimmed, empties removed.
///
/// When no usable title is found, the file stem of `path` is used with
/// underscores replaced by spaces.
fn extract_metadata(&self, doc: &PdfDocument, path: &Path) -> Metadata {
    /// Decodes string bytes: UTF-16BE when they start with the `FE FF`
    /// byte-order mark, UTF-8 otherwise. Returns `None` when the bytes are
    /// not valid in the detected encoding.
    fn decode_text(bytes: &[u8]) -> Option<String> {
        if bytes.starts_with(&[0xFE, 0xFF]) {
            // A dangling odd byte at the end is ignored by `chunks_exact`.
            let units: Vec<u16> = bytes[2..]
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            return String::from_utf16(&units).ok();
        }
        std::str::from_utf8(bytes)
            .ok()
            // Drop a UTF-8 byte-order mark if one is present.
            .map(|s| s.trim_start_matches('\u{FEFF}').to_owned())
    }

    /// Text of a string or name object; `None` for every other object kind.
    fn object_text(obj: &lopdf::Object) -> Option<String> {
        match obj {
            lopdf::Object::String(bytes, _) => decode_text(bytes),
            lopdf::Object::Name(bytes) => std::str::from_utf8(bytes).ok().map(str::to_owned),
            _ => None,
        }
    }

    let mut metadata = Metadata::default();

    let info_dict = doc
        .trailer
        .get(b"Info")
        .ok()
        .and_then(|info| info.as_reference().ok())
        .and_then(|info_ref| doc.get_dictionary(info_ref).ok());

    if let Some(info_dict) = info_dict {
        // A blank title is treated as missing so the filename fallback below
        // still applies.
        let title = info_dict
            .get(b"Title")
            .ok()
            .and_then(object_text)
            .filter(|t| !t.trim().is_empty());
        if title.is_some() {
            metadata.title = title;
        }

        if let Some(author_str) = info_dict.get(b"Author").ok().and_then(object_text) {
            metadata.authors.push(crate::Author {
                name: author_str,
                affiliation: None,
                email: None,
            });
        }

        if let Some(abstract_text) = info_dict.get(b"Subject").ok().and_then(object_text) {
            metadata.abstract_text = Some(abstract_text);
        }

        if let Some(keywords_str) = info_dict.get(b"Keywords").ok().and_then(object_text) {
            metadata.tags = keywords_str
                .split(',')
                .map(|s| s.trim().to_string())
                .filter(|s| !s.is_empty())
                .collect();
        }
    }

    if metadata.title.is_none() {
        metadata.title = path
            .file_stem()
            .and_then(|s| s.to_str())
            .map(|s| s.replace('_', " "));
    }
    metadata
}
/// Classifies the file as an arXiv paper or a plain local file, based only
/// on its name.
///
/// Returns `SourceType::Arxiv` when the file name contains `arxiv` in any
/// letter case (so `arXiv` counts) or when `extract_arxiv_id` finds an
/// identifier in it; otherwise `SourceType::Local`. Using the same
/// identifier check keeps the source type consistent with the `arxiv_id`
/// reported for the file, and avoids treating every name that merely starts
/// with `2` as an arXiv paper.
fn detect_source_type(&self, path: &Path) -> SourceType {
    let mentions_arxiv = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.to_ascii_lowercase().contains("arxiv"));

    if mentions_arxiv || self.extract_arxiv_id(path).is_some() {
        SourceType::Arxiv
    } else {
        SourceType::Local
    }
}
/// Finds a new-style arXiv identifier (`NNNN.NNNN` or `NNNN.NNNNN`) in the
/// file stem of `path`.
///
/// The identifier must not be glued to further digits on either side, so a
/// longer number such as `123456.789012` is not mistaken for an ID. A suffix
/// like `v2` is allowed and is not part of the returned ID.
///
/// Returns `None` when the stem is not valid UTF-8, contains no identifier,
/// or the pattern fails to compile.
fn extract_arxiv_id(&self, path: &Path) -> Option<String> {
    // Compiled on first use and shared by later calls.
    static ARXIV_ID: std::sync::OnceLock<Option<regex::Regex>> = std::sync::OnceLock::new();

    let filename = path.file_stem().and_then(|s| s.to_str())?;
    let re = ARXIV_ID
        .get_or_init(|| {
            // The `regex` crate has no look-around, so the digit boundaries
            // are non-capturing groups and the ID itself is group 1.
            regex::Regex::new(r"(?:^|[^0-9])([0-9]{4}\.[0-9]{4,5})(?:[^0-9]|$)").ok()
        })
        .as_ref()?;

    re.captures(filename)
        .and_then(|caps| caps.get(1))
        .map(|m| m.as_str().to_string())
}
}
impl Default for PdfIngester {
fn default() -> Self {
Self::new()
}
}
/// Exposes `PdfIngester` through the shared `Ingester` trait.
impl super::Ingester for PdfIngester {
    /// Forwards to the inherent `PdfIngester::ingest`.
    fn ingest(&self, path: &Path) -> Result<Document> {
        // The type-qualified path selects the inherent method rather than
        // calling back into this trait method.
        PdfIngester::ingest(self, path)
    }

    /// Returns `true` when `path` has an extension that lowercases to `pdf`.
    ///
    /// Paths without an extension, or whose extension is not valid UTF-8,
    /// are rejected.
    fn can_handle(&self, path: &Path) -> bool {
        match path.extension().and_then(|ext| ext.to_str()) {
            Some(ext) => ext.to_lowercase() == "pdf",
            None => false,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text passes through; `\n` and escaped parentheses are resolved.
    #[test]
    fn test_decode_pdf_string() {
        let ingester = PdfIngester::new();
        let cases = [
            ("hello", "hello"),
            ("hello\\nworld", "hello\nworld"),
            ("test\\(paren\\)", "test(paren)"),
        ];
        for &(raw, expected) in cases.iter() {
            assert_eq!(ingester.decode_pdf_string(raw), expected);
        }
    }

    /// IDs are found anywhere in the file stem; names without one give `None`.
    #[test]
    fn test_extract_arxiv_id() {
        let ingester = PdfIngester::new();
        let cases = [
            ("/data/papers/arxiv_2401.18059.pdf", Some("2401.18059")),
            ("/data/papers/cot_2201.11903.pdf", Some("2201.11903")),
            ("/data/papers/random_paper.pdf", None),
        ];
        for &(file, expected) in cases.iter() {
            let found = ingester.extract_arxiv_id(Path::new(file));
            assert_eq!(found.as_deref(), expected);
        }
    }

    /// Whitespace runs collapse to one space and the ends are trimmed.
    #[test]
    fn test_clean_text() {
        let ingester = PdfIngester::new();
        let dirty = " hello world \n\n test ";
        let cleaned = ingester.clean_text(dirty);
        assert_eq!(cleaned, "hello world test");
    }
}