use crate::core::{Content, Error, ExtractionResult, Metadata, MetadataValue, Result};
use crate::parsers::Parser;
use flate2::read::ZlibDecoder;
use lopdf::Document;
use std::io::Read;
/// Stateless unit struct that implements [`Parser`] for `application/pdf` input.
pub struct PdfParser;
impl Parser for PdfParser {
fn supported_types(&self) -> &[&str] {
&["application/pdf"]
}
fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult> {
match Document::load_mem(data) {
Ok(doc) => return finalize_from_doc(doc, mime_type, "strict", false),
Err(strict_err) => {
if let Some(truncated) = repair_trailing_junk(data) {
if let Ok(doc) = Document::load_mem(&truncated) {
return finalize_from_doc(doc, mime_type, "repaired_xref", true);
}
}
let scanned = raw_scan_text(data);
let scanned_ok =
!scanned.trim().is_empty() && looks_like_text(&scanned);
if !scanned_ok {
#[cfg(feature = "pdf-extract")]
if let Some(extracted) = try_pdf_extract(data) {
if looks_like_text(&extracted) {
let mut metadata = Metadata::new();
metadata.insert(
"pdf_parse_strategy".to_string(),
MetadataValue::Text("pdf_extract".to_string()),
);
metadata.insert(
"pdf_parse_partial".to_string(),
MetadataValue::Boolean(true),
);
metadata.insert(
"pdf_parse_error".to_string(),
MetadataValue::Text(strict_err.to_string()),
);
if let Some(version) = scan_pdf_version(data) {
metadata.insert(
"pdf_version".to_string(),
MetadataValue::Text(version),
);
}
return Ok(ExtractionResult {
mime_type: mime_type.to_string(),
content: Content::Text(extracted),
metadata,
detection_confidence: 1.0,
});
}
}
return Err(Error::ParseError(format!(
"Failed to load PDF: {strict_err}. Tried lenient \
fallback but recovered text was unreadable (font \
CMap, encryption, or non-FlateDecode compression). \
{extract_hint}",
extract_hint = pdf_extract_hint(),
)));
}
let mut metadata = Metadata::new();
metadata.insert(
"pdf_parse_strategy".to_string(),
MetadataValue::Text("raw_scan".to_string()),
);
metadata.insert(
"pdf_parse_partial".to_string(),
MetadataValue::Boolean(true),
);
metadata.insert(
"pdf_parse_error".to_string(),
MetadataValue::Text(strict_err.to_string()),
);
if let Some(version) = scan_pdf_version(data) {
metadata.insert(
"pdf_version".to_string(),
MetadataValue::Text(version),
);
}
Ok(ExtractionResult {
mime_type: mime_type.to_string(),
content: Content::Text(scanned),
metadata,
detection_confidence: 1.0,
})
}
}
}
fn name(&self) -> &str {
"PdfParser"
}
}
/// Turns a successfully loaded document into an [`ExtractionResult`].
///
/// When the extracted text is blank, `maybe_ocr_pdf_images` is consulted: any
/// metadata it reports is merged in, and its text (if any) replaces the blank
/// text. `strategy` is stored under `pdf_parse_strategy`; `pdf_parse_partial`
/// is only written when `partial` is true.
fn finalize_from_doc(
    doc: Document,
    mime_type: &str,
    strategy: &str,
    partial: bool,
) -> Result<ExtractionResult> {
    let extracted = extract_text(&doc)?;
    let mut metadata = extract_metadata(&doc)?;

    let body = if extracted.trim().is_empty() {
        let (recognized, ocr_meta) = maybe_ocr_pdf_images(&doc);
        for (key, value) in ocr_meta.into_iter().flatten() {
            metadata.insert(key, value);
        }
        match recognized {
            Some(ocr_text) => ocr_text,
            None => extracted,
        }
    } else {
        extracted
    };

    metadata.insert(
        "pdf_parse_strategy".to_string(),
        MetadataValue::Text(strategy.to_string()),
    );
    if partial {
        metadata.insert(
            "pdf_parse_partial".to_string(),
            MetadataValue::Boolean(true),
        );
    }

    Ok(ExtractionResult {
        mime_type: mime_type.to_string(),
        content: Content::Text(body),
        metadata,
        detection_confidence: 1.0,
    })
}
/// Suffix for the final parse error, chosen by the `pdf-extract` feature.
///
/// With the feature enabled it says the extra fallback was tried and failed;
/// otherwise it tells the user how to enable that fallback.
fn pdf_extract_hint() -> &'static str {
    if cfg!(feature = "pdf-extract") {
        "(pdf-extract fallback was tried and also failed.)"
    } else {
        "Rebuild with --features pdf-extract for an extra linearized-PDF / \
         CMap-aware fallback parser."
    }
}
/// Runs the optional `pdf_extract` crate over the raw bytes.
///
/// Form feeds in its output become blank lines and the result is trimmed.
/// Returns `None` when extraction fails or nothing but whitespace comes back.
#[cfg(feature = "pdf-extract")]
fn try_pdf_extract(data: &[u8]) -> Option<String> {
    let extracted = pdf_extract::extract_text_from_mem(data).ok()?;
    let with_breaks = extracted.replace('\x0c', "\n\n");
    let body = with_breaks.trim();
    (!body.is_empty()).then(|| body.to_string())
}
/// Heuristic that separates readable text from mis-decoded glyph soup.
///
/// A string passes when at least 60% of its characters are readable (ASCII
/// graphic, space/tab/CR/LF, or any alphabetic character) and it contains a
/// run of four or more consecutive ASCII alphanumerics. Empty input fails.
fn looks_like_text(s: &str) -> bool {
    let mut char_count = 0usize;
    let mut readable = 0usize;
    let mut current_run = 0usize;
    let mut longest_run = 0usize;

    for ch in s.chars() {
        char_count += 1;
        if ch.is_ascii_graphic() || ch.is_alphabetic() || matches!(ch, ' ' | '\t' | '\n' | '\r') {
            readable += 1;
        }
        // Any non-alphanumeric character ends the current run.
        current_run = if ch.is_ascii_alphanumeric() {
            current_run + 1
        } else {
            0
        };
        longest_run = longest_run.max(current_run);
    }

    if char_count == 0 {
        return false;
    }
    longest_run >= 4 && (readable as f64) / (char_count as f64) >= 0.60
}
/// Recovers the version from the first `%PDF-` header found in `data`.
///
/// The token runs from just after the header to the first ASCII whitespace
/// byte; when there is no whitespace, at most 8 bytes are considered.
///
/// Returns `None` when there is no header or the token is not a dotted
/// sequence of non-empty digit groups. `1.4` and `2.0` are accepted; `14`,
/// `.`, `1.` and `1..4` are rejected.
fn scan_pdf_version(data: &[u8]) -> Option<String> {
    const HEADER: &[u8] = b"%PDF-";

    let header_pos = data.windows(HEADER.len()).position(|w| w == HEADER)?;
    let rest = &data[header_pos + HEADER.len()..];
    let end = rest
        .iter()
        .position(|b| b.is_ascii_whitespace())
        .unwrap_or_else(|| rest.len().min(8));
    let token = std::str::from_utf8(&rest[..end]).ok()?;

    // Require at least one dot, and digits on both sides of every dot, so
    // degenerate tokens made only of dots are not reported as a version.
    let well_formed = token.contains('.')
        && token
            .split('.')
            .all(|part| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit()));
    well_formed.then(|| token.to_string())
}
/// Cuts off whatever follows the last `%%EOF` marker.
///
/// One end-of-line sequence (`\r\n`, `\r` or `\n`) directly after the marker
/// is kept. Returns `None` when there is no marker or nothing would be removed.
fn repair_trailing_junk(data: &[u8]) -> Option<Vec<u8>> {
    const NEEDLE: &[u8] = b"%%EOF";

    let marker_at = data.windows(NEEDLE.len()).rposition(|w| w == NEEDLE)?;
    let after_marker = marker_at + NEEDLE.len();
    let eol_len = match &data[after_marker..] {
        [b'\r', b'\n', ..] => 2,
        [b'\r', ..] | [b'\n', ..] => 1,
        _ => 0,
    };
    let keep = after_marker + eol_len;

    (keep < data.len()).then(|| data[..keep].to_vec())
}
/// Last-resort text recovery that never builds a document model.
///
/// Every `stream … endstream` payload is tried as zlib, LZW, ASCII85 and as
/// raw bytes. Each decoding that contains a text-showing operator is scanned
/// and its text appended, in that order.
fn raw_scan_text(data: &[u8]) -> String {
    let mut text = String::new();
    for payload in iter_streams(data) {
        let attempts = [
            inflate_zlib(payload),
            decode_lzw(payload),
            decode_ascii85(payload),
            Some(payload.to_vec()),
        ];
        attempts
            .into_iter()
            .flatten()
            .filter(|bytes| has_text_operators(bytes))
            .for_each(|bytes| scan_content_operators(&bytes, &mut text));
    }
    text
}
fn has_text_operators(data: &[u8]) -> bool {
find_subsequence(data, b"Tj").is_some() || find_subsequence(data, b"TJ").is_some()
}
/// Attempts to decode `data` as a PDF `LZWDecode` stream (MSB-first, 8-bit).
///
/// PDF's `LZWDecode` filter defaults to `EarlyChange = 1`, the same early
/// code-size switch TIFF uses, so the TIFF-compatible decoder is tried first.
/// The raw scanner never parses the stream dictionary, so streams written
/// with `EarlyChange 0` are covered by falling back to the standard decoder
/// when the first attempt reports an error.
///
/// Returns `None` for empty input or when both decoders fail.
fn decode_lzw(data: &[u8]) -> Option<Vec<u8>> {
    use weezl::{decode::Decoder, BitOrder};

    if data.is_empty() {
        return None;
    }
    Decoder::with_tiff_size_switch(BitOrder::Msb, 8)
        .decode(data)
        .or_else(|_| Decoder::new(BitOrder::Msb, 8).decode(data))
        .ok()
}
/// Attempts to decode `data` as ASCII85.
///
/// Surrounding whitespace and the optional `<~` / `~>` delimiters are removed
/// first. Returns `None` when the bytes are not UTF-8 or decoding fails.
fn decode_ascii85(data: &[u8]) -> Option<Vec<u8>> {
    let text = std::str::from_utf8(data).ok()?.trim();
    let without_open = text.strip_prefix("<~").unwrap_or(text);
    let body = without_open.strip_suffix("~>").unwrap_or(without_open);
    ascii85::decode(body).ok()
}
/// Lazily yields every `stream … endstream` payload found in `data`.
///
/// A `stream` keyword only counts when it is followed directly by `\r\n` or
/// `\n`; other occurrences are skipped. The payload ends at the next
/// `endstream`; no `/Length` entry is consulted.
fn iter_streams<'a>(data: &'a [u8]) -> impl Iterator<Item = &'a [u8]> + 'a {
    const OPEN: &[u8] = b"stream";
    const CLOSE: &[u8] = b"endstream";

    let mut pos = 0usize;
    std::iter::from_fn(move || loop {
        let keyword_at = find_subsequence(data.get(pos..)?, OPEN)?;
        pos += keyword_at + OPEN.len();

        let body_start = match (data.get(pos), data.get(pos + 1)) {
            (Some(b'\r'), Some(b'\n')) => pos + 2,
            (Some(b'\n'), _) => pos + 1,
            // Not a stream opener; keep searching after this keyword.
            _ => continue,
        };

        let close_at = find_subsequence(data.get(body_start..)?, CLOSE)?;
        let body_end = body_start + close_at;
        pos = body_end + CLOSE.len();
        return Some(&data[body_start..body_end]);
    })
}
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty needle, or one longer than the haystack, yields `None`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `windows(0)` would panic, so the empty needle is rejected up front.
    if needle.is_empty() {
        return None;
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
/// Inflates `data` as a zlib stream; any decoder error yields `None`.
fn inflate_zlib(data: &[u8]) -> Option<Vec<u8>> {
    let mut inflated = Vec::new();
    match ZlibDecoder::new(data).read_to_end(&mut inflated) {
        Ok(_) => Some(inflated),
        Err(_) => None,
    }
}
/// Walks a decoded content stream and appends the text it shows to `out`.
///
/// * `%` comments are skipped up to the end of the line.
/// * A literal string is emitted when the next operator is `Tj`, `TJ`, `'` or
///   `"`; the quote operators also append a newline.
/// * A `[...]` array is emitted when the next operator is `TJ`.
/// * A stand-alone `ET` appends a newline, but only when text was emitted
///   since the previous one.
fn scan_content_operators(data: &[u8], out: &mut String) {
    let len = data.len();
    let mut pos = 0usize;
    let mut pending_break = false;

    while pos < len {
        match data[pos] {
            b'%' => {
                // Stop on the line terminator itself; the default arm steps over it.
                pos += data[pos..]
                    .iter()
                    .position(|&c| c == b'\n' || c == b'\r')
                    .unwrap_or(len - pos);
            }
            b'(' => {
                let (literal, after) = read_literal_string(data, pos);
                pos = after;
                match peek_next_operator(data, pos).as_deref() {
                    Some("Tj") | Some("TJ") => {
                        out.push_str(&literal);
                        pending_break = true;
                    }
                    Some("'") | Some("\"") => {
                        out.push_str(&literal);
                        out.push('\n');
                        pending_break = true;
                    }
                    _ => {}
                }
            }
            b'[' => {
                let (joined, after) = read_tj_array(data, pos);
                pos = after;
                if peek_next_operator(data, pos).as_deref() == Some("TJ") {
                    out.push_str(&joined);
                    pending_break = true;
                }
            }
            b'E' if pending_break
                && data[pos..].starts_with(b"ET")
                && is_word_boundary(data, pos, 2) =>
            {
                out.push('\n');
                pending_break = false;
                pos += 2;
            }
            _ => pos += 1,
        }
    }
}
/// Decodes the PDF literal string whose opening `(` is at `data[start]`.
///
/// Returns the decoded text and the index just past the closing `)`, or
/// `data.len()` when the string is unterminated. Balanced nested parentheses
/// are kept verbatim. Each byte becomes the Unicode scalar with the same
/// value; no font encoding is applied.
///
/// Escapes handled:
/// * `\n \r \t \b \f \\ \( \)` - the usual single-character escapes.
/// * `\ddd` - one to three octal digits; high-order overflow above 8 bits is
///   dropped, so the escape always produces exactly one byte.
/// * backslash + end-of-line (`\r\n`, `\r` or `\n`) - a line continuation;
///   the whole end-of-line sequence is removed.
/// * any other escaped byte - the backslash is dropped and the byte kept.
fn read_literal_string(data: &[u8], start: usize) -> (String, usize) {
    let mut s = String::new();
    let mut i = start + 1;
    let mut depth = 1usize;
    let n = data.len();

    while i < n && depth > 0 {
        match data[i] {
            b'\\' => {
                i += 1;
                // A trailing backslash ends the scan at end of input.
                let Some(&escaped) = data.get(i) else { break };
                match escaped {
                    b'n' => s.push('\n'),
                    b'r' => s.push('\r'),
                    b't' => s.push('\t'),
                    b'b' => s.push('\x08'),
                    b'f' => s.push('\x0c'),
                    b'\\' => s.push('\\'),
                    b'(' => s.push('('),
                    b')' => s.push(')'),
                    b'\r' => {
                        // Swallow CRLF as one unit so no stray `\n` leaks out.
                        if data.get(i + 1) == Some(&b'\n') {
                            i += 1;
                        }
                    }
                    b'\n' => {}
                    first @ b'0'..=b'7' => {
                        let mut val = u32::from(first - b'0');
                        let mut digits = 1;
                        while digits < 3 {
                            match data.get(i + 1).copied() {
                                Some(d @ b'0'..=b'7') => {
                                    val = val * 8 + u32::from(d - b'0');
                                    i += 1;
                                    digits += 1;
                                }
                                _ => break,
                            }
                        }
                        // Keep the low 8 bits only: an octal escape encodes one byte.
                        s.push(char::from((val & 0xFF) as u8));
                    }
                    other => s.push(other as char),
                }
                i += 1;
            }
            b'(' => {
                depth += 1;
                s.push('(');
                i += 1;
            }
            b')' => {
                depth -= 1;
                // The outermost `)` terminates the string and is not emitted.
                if depth > 0 {
                    s.push(')');
                }
                i += 1;
            }
            other => {
                s.push(other as char);
                i += 1;
            }
        }
    }
    (s, i)
}
/// Concatenates the literal strings inside the array opened at `data[start]`.
///
/// Numbers and any other bytes between elements are ignored. Hex strings
/// (`<...>`) are skipped without being decoded. Returns the joined text and
/// the index just past the closing `]`, or `data.len()` if it is missing.
fn read_tj_array(data: &[u8], start: usize) -> (String, usize) {
    let len = data.len();
    let mut joined = String::new();
    let mut pos = start + 1;

    while let Some(&byte) = data.get(pos) {
        if byte == b']' {
            pos += 1;
            break;
        }
        if byte == b'(' {
            let (fragment, after) = read_literal_string(data, pos);
            joined.push_str(&fragment);
            pos = after;
            continue;
        }
        if byte == b'<' {
            // Jump past the matching `>`, or to the end when it never closes.
            pos = match data[pos..].iter().position(|&c| c == b'>') {
                Some(offset) => pos + offset + 1,
                None => len,
            };
            continue;
        }
        pos += 1;
    }
    (joined, pos)
}
/// Looks ahead from `start` for the next operator token without consuming it.
///
/// Whitespace, digits and the characters `-`, `+`, `.` are skipped first. The
/// operator is the following run of ASCII letters, `'` and `"`. Returns `None`
/// when the input ends or the next byte cannot start an operator.
fn peek_next_operator(data: &[u8], start: usize) -> Option<String> {
    let tail = data.get(start..)?;

    let skipped = tail
        .iter()
        .take_while(|b| {
            b.is_ascii_whitespace() || b.is_ascii_digit() || matches!(**b, b'-' | b'+' | b'.')
        })
        .count();
    let candidate = &tail[skipped..];

    let op_len = candidate
        .iter()
        .take_while(|b| b.is_ascii_alphabetic() || matches!(**b, b'\'' | b'"'))
        .count();
    if op_len == 0 {
        return None;
    }

    std::str::from_utf8(&candidate[..op_len])
        .ok()
        .map(str::to_owned)
}
/// True when the `len`-byte token at `data[i..]` is not glued to an ASCII
/// alphanumeric on either side; the edges of the buffer count as boundaries.
fn is_word_boundary(data: &[u8], i: usize, len: usize) -> bool {
    let clear_before = match i.checked_sub(1) {
        Some(prev) => !data[prev].is_ascii_alphanumeric(),
        None => true,
    };
    let clear_after = data
        .get(i + len)
        .map_or(true, |next| !next.is_ascii_alphanumeric());
    clear_before && clear_after
}
/// Concatenates the text of every page in `doc.get_pages()` order.
///
/// Pages are separated by a newline unless the text gathered so far already
/// ends with one. A page that fails to extract is reported on stderr and
/// skipped; it does not fail the whole document.
fn extract_text(doc: &Document) -> Result<String> {
    let pages = doc.get_pages();
    let mut combined = String::new();

    for &page_number in pages.keys() {
        let page_text = match doc.extract_text(&[page_number]) {
            Ok(extracted) => extracted,
            Err(e) => {
                eprintln!("Warning: Failed to extract text from page {}: {}", page_number, e);
                continue;
            }
        };
        let needs_separator = !(combined.is_empty() || combined.ends_with('\n'));
        if needs_separator {
            combined.push('\n');
        }
        combined.push_str(&page_text);
    }

    Ok(combined)
}
/// Collects document-level metadata from a loaded PDF.
///
/// Always written: `page_count`, `pdf_version`, `encrypted` (whether the
/// trailer has an `Encrypt` entry) and `annotations_count`.
/// From the Info dictionary, when present: the usual string entries.
/// From the catalog, when it resolves: `page_layout`, `page_mode`,
/// `form_fields_count` and `attachments_count` (length of the
/// `EmbeddedFiles` names array divided by two).
fn extract_metadata(doc: &Document) -> Result<Metadata> {
    let page_map = doc.get_pages();
    let mut meta = Metadata::new();

    meta.insert(
        "page_count".to_string(),
        MetadataValue::Number(page_map.len() as i64),
    );
    meta.insert(
        "pdf_version".to_string(),
        MetadataValue::Text(doc.version.clone()),
    );
    meta.insert(
        "encrypted".to_string(),
        MetadataValue::Boolean(doc.trailer.get(b"Encrypt").is_ok()),
    );

    if let Some(info) = resolve_info_dict(doc) {
        let wanted = [
            (&b"Title"[..], "title"),
            (&b"Author"[..], "author"),
            (&b"Subject"[..], "subject"),
            (&b"Creator"[..], "creator"),
            (&b"Producer"[..], "producer"),
            (&b"CreationDate"[..], "creation_date"),
            (&b"ModDate"[..], "modification_date"),
            (&b"Keywords"[..], "keywords"),
        ];
        for (pdf_key, out_key) in wanted {
            let Some(value) = info.get(pdf_key).ok().and_then(|obj| obj.as_string().ok()) else {
                continue;
            };
            meta.insert(out_key.to_string(), MetadataValue::Text(value.to_string()));
        }
    }

    if let Some(catalog) = resolve_catalog(doc) {
        // Name-valued viewer preferences, copied through when present.
        for (name_key, out_key) in [
            (&b"PageLayout"[..], "page_layout"),
            (&b"PageMode"[..], "page_mode"),
        ] {
            if let Some(name) = catalog
                .get(name_key)
                .ok()
                .and_then(|v| v.as_name_str().ok())
            {
                meta.insert(out_key.to_string(), MetadataValue::Text(name.to_string()));
            }
        }

        let form_fields = catalog
            .get(b"AcroForm")
            .ok()
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_dict().ok())
            .and_then(|acro_form| acro_form.get(b"Fields").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_array().ok())
            .map_or(0, |fields| fields.len());
        meta.insert(
            "form_fields_count".to_string(),
            MetadataValue::Number(form_fields as i64),
        );

        let attachments = catalog
            .get(b"Names")
            .ok()
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_dict().ok())
            .and_then(|names| names.get(b"EmbeddedFiles").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_dict().ok())
            .and_then(|embedded| embedded.get(b"Names").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_array().ok())
            .map_or(0, |pairs| pairs.len() / 2);
        meta.insert(
            "attachments_count".to_string(),
            MetadataValue::Number(attachments as i64),
        );
    }

    let mut annotation_total = 0usize;
    for page_id in page_map.values() {
        let annots = doc
            .get_object(*page_id)
            .ok()
            .and_then(|page| page.as_dict().ok())
            .and_then(|page_dict| page_dict.get(b"Annots").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_array().ok());
        if let Some(list) = annots {
            annotation_total += list.len();
        }
    }
    meta.insert(
        "annotations_count".to_string(),
        MetadataValue::Number(annotation_total as i64),
    );

    Ok(meta)
}
/// Follows the trailer's `Info` entry to its dictionary, if there is one.
fn resolve_info_dict(doc: &Document) -> Option<&lopdf::Dictionary> {
    doc.trailer
        .get(b"Info")
        .ok()
        .and_then(|entry| dereference(doc, entry))
        .and_then(|obj| obj.as_dict().ok())
}
/// Follows the trailer's `Root` entry to the catalog dictionary, if there is one.
fn resolve_catalog(doc: &Document) -> Option<&lopdf::Dictionary> {
    doc.trailer
        .get(b"Root")
        .ok()
        .and_then(|entry| dereference(doc, entry))
        .and_then(|obj| obj.as_dict().ok())
}
/// Resolves one level of indirection: a reference is looked up in `doc`
/// (`None` if the lookup fails); any other object is returned unchanged.
fn dereference<'a>(doc: &'a Document, obj: &'a lopdf::Object) -> Option<&'a lopdf::Object> {
    if let lopdf::Object::Reference(id) = obj {
        doc.get_object(*id).ok()
    } else {
        Some(obj)
    }
}
/// Gathers the raw bytes of embedded JPEG images referenced by the pages.
///
/// Only XObjects stored as indirect references are considered. Each object
/// id is visited once, even when several pages share it. An image is
/// collected only when its subtype is `Image` and its first filter is
/// `DCTDecode`; the stream content is copied as-is and tagged `image/jpeg`.
fn collect_embedded_images(doc: &Document) -> Vec<(Vec<u8>, &'static str)> {
    use std::collections::HashSet;

    let mut images = Vec::new();
    let mut visited: HashSet<lopdf::ObjectId> = HashSet::new();

    for (_, page_id) in doc.get_pages() {
        // Page -> Resources -> XObject, resolving indirect references on the way.
        let xobjects = doc
            .get_object(page_id)
            .ok()
            .and_then(|page| page.as_dict().ok())
            .and_then(|page_dict| page_dict.get(b"Resources").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_dict().ok())
            .and_then(|resources| resources.get(b"XObject").ok())
            .and_then(|v| dereference(doc, v))
            .and_then(|v| v.as_dict().ok());
        let Some(xobjects) = xobjects else { continue };

        for (_name, entry) in xobjects.iter() {
            let lopdf::Object::Reference(id) = entry else {
                continue;
            };
            if !visited.insert(*id) {
                continue;
            }
            let Some(stream) = doc.get_object(*id).ok().and_then(|o| o.as_stream().ok()) else {
                continue;
            };

            let is_image = stream
                .dict
                .get(b"Subtype")
                .ok()
                .and_then(|v| v.as_name_str().ok())
                == Some("Image");
            if !is_image {
                continue;
            }

            let filters = stream.filters().unwrap_or_default();
            if filters.first().map(String::as_str) == Some("DCTDecode") {
                images.push((stream.content.clone(), "image/jpeg"));
            }
        }
    }
    images
}
/// Runs OCR over the embedded images of a PDF whose text layer is empty.
///
/// Returns `(text, metadata)`:
/// * `(None, None)` when the `ocr` feature is compiled out or disabled at
///   runtime;
/// * `(None, Some(..))` when no images were found or none produced text;
/// * `(Some(text), Some(..))` when at least one image was recognized. Each
///   recognized image is introduced by an `[image i of n]` line.
fn maybe_ocr_pdf_images(
    _doc: &Document,
) -> (Option<String>, Option<Vec<(String, MetadataValue)>>) {
    #[cfg(feature = "ocr")]
    {
        if !crate::ocr::runtime_enabled() {
            return (None, None);
        }

        let images = collect_embedded_images(_doc);
        if images.is_empty() {
            let info = vec![
                (
                    "ocr_status".to_string(),
                    MetadataValue::Text("no_embedded_images".into()),
                ),
                ("ocr_applied".to_string(), MetadataValue::Boolean(false)),
            ];
            return (None, Some(info));
        }

        let total = images.len();
        let mut combined = String::new();
        // One confidence score per recognized image, so its length is also
        // the number of recognized images.
        let mut scores: Vec<f32> = Vec::new();

        for (index, (bytes, _hint)) in images.iter().enumerate() {
            if let crate::ocr::OcrAttempt::Recognized {
                text: image_text,
                mean_confidence,
            } = crate::ocr::run_ocr(bytes)
            {
                if !combined.is_empty() {
                    combined.push_str("\n\n");
                }
                combined.push_str(&format!("[image {} of {}]\n", index + 1, total));
                combined.push_str(&image_text);
                scores.push(mean_confidence);
            }
        }

        let recognized = scores.len();
        let any_recognized = recognized > 0;
        let status = if any_recognized {
            "recognized"
        } else {
            "no_text_found"
        };

        let mut info = vec![
            (
                "ocr_status".to_string(),
                MetadataValue::Text(status.to_string()),
            ),
            (
                "ocr_applied".to_string(),
                MetadataValue::Boolean(any_recognized),
            ),
            (
                "ocr_images_total".to_string(),
                MetadataValue::Number(total as i64),
            ),
            (
                "ocr_images_recognized".to_string(),
                MetadataValue::Number(recognized as i64),
            ),
        ];
        if any_recognized {
            let mean = scores.iter().sum::<f32>() / scores.len() as f32;
            info.push((
                "ocr_confidence".to_string(),
                MetadataValue::Float(mean as f64),
            ));
        }

        (any_recognized.then_some(combined), Some(info))
    }
    #[cfg(not(feature = "ocr"))]
    {
        (None, None)
    }
}
// Unit tests for the byte-level fallback helpers. None of them need a PDF
// that lopdf can load.
#[cfg(test)]
mod tests {
use super::*;
// Bytes appended after `%%EOF\n` are removed and the newline is kept.
#[test]
fn repair_strips_trailing_junk() {
let mut bytes = b"%PDF-1.4\n...body...\n%%EOF\n".to_vec();
let original_len = bytes.len();
bytes.extend_from_slice(b"garbage appended by middlebox");
let repaired = repair_trailing_junk(&bytes).expect("should detect trailing junk");
assert_eq!(repaired.len(), original_len);
assert!(repaired.ends_with(b"%%EOF\n"));
}
// A file that already ends at `%%EOF\n` needs no repair.
#[test]
fn repair_returns_none_when_no_junk() {
let bytes = b"%PDF-1.4\n...body...\n%%EOF\n".to_vec();
assert!(repair_trailing_junk(&bytes).is_none());
}
// An uncompressed content stream with a `Tj` operator yields its string.
#[test]
fn raw_scan_extracts_literal_strings() {
let content = b"BT (Hello World) Tj ET\n";
let mut blob = Vec::new();
blob.extend_from_slice(b"%PDF-1.4\n");
blob.extend_from_slice(b"4 0 obj << /Length 25 >> stream\n");
blob.extend_from_slice(content);
blob.extend_from_slice(b"endstream endobj\n");
let text = raw_scan_text(&blob);
assert!(text.contains("Hello World"), "got {:?}", text);
}
// Pieces of a `TJ` array are joined; the kerning number between them is ignored.
#[test]
fn raw_scan_extracts_tj_array() {
let content = b"BT [(Hel) -300 (lo)] TJ ET\n";
let mut blob = Vec::new();
blob.extend_from_slice(b"%PDF-1.4\n");
blob.extend_from_slice(b"4 0 obj << /Length 26 >> stream\n");
blob.extend_from_slice(content);
blob.extend_from_slice(b"endstream endobj\n");
let text = raw_scan_text(&blob);
assert!(text.contains("Hello"), "got {:?}", text);
}
// An LZW-compressed payload (MSB-first, 8-bit) is decoded before scanning.
#[test]
fn raw_scan_decodes_lzw_stream() {
use weezl::{encode::Encoder, BitOrder};
let content = b"BT (LZW worked) Tj ET\n";
let compressed = Encoder::new(BitOrder::Msb, 8).encode(content).unwrap();
let mut blob = Vec::new();
blob.extend_from_slice(b"%PDF-1.4\n");
blob.extend_from_slice(b"4 0 obj << /Filter /LZWDecode >> stream\n");
blob.extend_from_slice(&compressed);
blob.extend_from_slice(b"\nendstream endobj\n");
let text = raw_scan_text(&blob);
assert!(text.contains("LZW worked"), "got {:?}", text);
}
// An ASCII85-encoded payload is decoded before scanning.
#[test]
fn raw_scan_decodes_ascii85_stream() {
let content = b"BT (ASCII85 worked) Tj ET\n";
let encoded = ascii85::encode(content);
let mut blob = Vec::new();
blob.extend_from_slice(b"%PDF-1.4\n");
blob.extend_from_slice(b"4 0 obj << /Filter /ASCII85Decode >> stream\n");
blob.extend_from_slice(encoded.as_bytes());
blob.extend_from_slice(b"\nendstream endobj\n");
let text = raw_scan_text(&blob);
assert!(text.contains("ASCII85 worked"), "got {:?}", text);
}
// Ordinary prose passes the readability heuristic.
#[test]
fn looks_like_text_accepts_real_text() {
assert!(looks_like_text("Hello PDF\nWorld"));
assert!(looks_like_text("The quick brown fox jumps over the lazy dog"));
}
// Control characters 0x01..=0x0C (raw glyph indices) are rejected.
#[test]
fn looks_like_text_rejects_glyph_indices() {
let garbage: String = (1u8..=12).map(|b| b as char).collect();
assert!(!looks_like_text(&garbage));
}
// A two-letter island inside control bytes is below the run-of-four requirement.
#[test]
fn looks_like_text_rejects_short_alphanumeric_islands() {
let s = "\x01\x02\x03ab\x04\x05\x06";
assert!(!looks_like_text(s));
}
// `\n`, `\(` and `\)` escapes are decoded inside a literal string.
#[test]
fn literal_string_handles_escapes() {
let input = b"(Hello\\nworld\\(paren\\))";
let (s, _) = read_literal_string(input, 0);
assert_eq!(s, "Hello\nworld(paren)");
}
}