mod cmap;
mod content;
mod content_lex;
mod object_parser;
mod lazy;
mod fonts;
mod graphics;
mod objects;
mod tables;
use lopdf::ObjectId;
use crate::error::Result;
use crate::ir::{Document, Metadata, Page, SourceKind, Warning, WarningKind};
use objects::PdfDoc;
/// Parses an in-memory PDF into the crate's `Document` representation.
///
/// `data` and the optional `password` are handed to `PdfDoc::load`. The
/// warnings produced while loading come first in the returned document,
/// followed by each page's warnings in the order the pages are returned by
/// `extract_pages`. `metadata.page_count` is the number of pages the loaded
/// document reports.
///
/// # Errors
///
/// Propagates the error returned by `PdfDoc::load`. Page extraction does not
/// return errors at this level; it reports problems as warnings instead.
pub(crate) fn parse_pdf(data: &[u8], password: Option<&str>) -> Result<Document> {
    let (pdf, load_warnings) = PdfDoc::load(data, password)?;
    let page_refs: Vec<(u32, ObjectId)> = pdf.pages().into_iter().collect();

    let metadata = Metadata {
        page_count: page_refs.len() as u32,
        ..Default::default()
    };
    let mut document = Document {
        source: SourceKind::Pdf,
        metadata,
        warnings: load_warnings,
        ..Default::default()
    };

    // Keep each page's warnings adjacent to the point where the page is added,
    // so warning order follows page order.
    for (page, page_warnings) in extract_pages(&pdf, &page_refs) {
        document.warnings.extend(page_warnings);
        document.pages.push(page);
    }

    Ok(document)
}
/// Extracts one page and returns it together with the warnings raised for it.
///
/// The page's `index` is `page_number - 1`, saturating at zero. Width and
/// height are the absolute extents of the media box when the document
/// provides one; otherwise they keep their `Default` values. Characters,
/// rectangles and rules come from `content::extract_page`.
///
/// A `WarningKind::NeedsOcr` warning is appended when the page yielded no
/// characters and `pdf.page_has_image` reports an image on it.
fn extract_one(pdf: &PdfDoc<'_>, page_number: u32, page_id: ObjectId) -> (Page, Vec<Warning>) {
    let index = page_number.saturating_sub(1);
    let mut page = Page {
        index,
        ..Default::default()
    };

    if let Some(corners) = pdf.media_box(page_id) {
        let [ax, ay, bx, by] = corners;
        // `abs` keeps the size non-negative whichever corner is listed first.
        page.width = (bx - ax).abs();
        page.height = (by - ay).abs();
    }

    let extracted = content::extract_page(pdf, page_id, index, page.height);
    let mut warnings = extracted.warnings;
    page.chars = extracted.chars;
    page.rects = extracted.rects;
    page.rules = extracted.rules;

    // Only consult the image check when there is no text at all.
    let lacks_text_layer = page.chars.is_empty() && pdf.page_has_image(page_id);
    if lacks_text_layer {
        warnings.push(Warning {
            page: Some(index),
            kind: WarningKind::NeedsOcr,
            detail: "page has no text layer (scanned); needs an OCR backend".into(),
        });
    }

    (page, warnings)
}
/// Runs `extract_one` over every `(page number, object id)` pair in `pages`.
///
/// The result has one entry per input pair, in input order. With the `rayon`
/// feature enabled the pages are processed through a parallel iterator;
/// without it they are processed sequentially.
fn extract_pages(pdf: &PdfDoc<'_>, pages: &[(u32, ObjectId)]) -> Vec<(Page, Vec<Warning>)> {
    // Shared by both build configurations so the per-page work is spelled once.
    let extract = |&(number, id): &(u32, ObjectId)| extract_one(pdf, number, id);

    #[cfg(feature = "rayon")]
    {
        use rayon::prelude::*;
        pages.par_iter().map(extract).collect()
    }
    #[cfg(not(feature = "rayon"))]
    {
        pages.iter().map(extract).collect()
    }
}
#[cfg(test)]
mod tests {
    use super::parse_pdf;
    use crate::ir::{SourceKind, WarningKind};
    use lopdf::content::{Content, Operation};
    use lopdf::{dictionary, Document as LoDoc, Object, ObjectId, Stream};

    /// Completes a one-page fixture and serializes it to bytes.
    ///
    /// Adds `content` as the page's content stream, creates a 612x792 page
    /// (optionally pointing at `resources_id`), stores the page tree under the
    /// pre-reserved `pages_id`, wires up the catalog and trailer, and saves
    /// the document into a buffer.
    fn finish_single_page(
        mut doc: LoDoc,
        pages_id: ObjectId,
        content: Content,
        resources_id: Option<ObjectId>,
    ) -> Vec<u8> {
        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));
        let mut page = dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
        };
        if let Some(resources_id) = resources_id {
            page.set("Resources", resources_id);
        }
        let page_id = doc.add_object(page);
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));
        let catalog_id = doc.add_object(dictionary! { "Type" => "Catalog", "Pages" => pages_id });
        doc.trailer.set("Root", catalog_id);
        let mut buf = Vec::new();
        doc.save_to(&mut buf).unwrap();
        buf
    }

    /// Builds a one-page PDF that shows `text` in Courier at size 24,
    /// positioned at (100, 700).
    fn sample_pdf(text: &str) -> Vec<u8> {
        let mut doc = LoDoc::with_version("1.5");
        let pages_id = doc.new_object_id();
        let font_id = doc.add_object(dictionary! {
            "Type" => "Font", "Subtype" => "Type1", "BaseFont" => "Courier",
        });
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! { "F1" => font_id },
        });
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 24.into()]),
                Operation::new("Td", vec![100.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal(text)]),
                Operation::new("ET", vec![]),
            ],
        };
        finish_single_page(doc, pages_id, content, Some(resources_id))
    }

    #[test]
    fn extracts_positioned_chars_from_digital_pdf() {
        let bytes = sample_pdf("Hello pdfmuse");
        let doc = parse_pdf(&bytes, None).expect("parses a digital PDF");
        assert_eq!(doc.source, SourceKind::Pdf);
        assert_eq!(doc.pages.len(), 1);
        assert_eq!(doc.metadata.page_count, 1);
        assert!(doc.warnings.is_empty(), "unexpected warnings: {:?}", doc.warnings);
        assert_eq!((doc.pages[0].width, doc.pages[0].height), (612.0, 792.0));
        let chars = &doc.pages[0].chars;
        let text: String = chars.iter().map(|c| c.text.as_str()).collect();
        assert_eq!(text, "Hello pdfmuse");
        let first = &chars[0];
        assert_eq!(first.text, "H");
        assert_eq!(first.size, 24.0);
        assert!((first.bbox.x0 - 100.0).abs() < 0.5, "x0 = {}", first.bbox.x0);
        assert!(first.bbox.y1 > first.bbox.y0, "bbox should have positive height");
        // 114.4 = 100 + 14.4, i.e. one 0.6 em advance at size 24.
        assert!((chars[1].bbox.x0 - 114.4).abs() < 0.5, "second glyph x0 = {}", chars[1].bbox.x0);
    }

    /// Builds a one-page PDF whose only content is a stroked 200x50
    /// rectangle with its origin at (100, 100).
    fn sample_pdf_with_rect() -> Vec<u8> {
        let mut doc = LoDoc::with_version("1.5");
        let pages_id = doc.new_object_id();
        let content = Content {
            operations: vec![
                Operation::new("re", vec![100.into(), 100.into(), 200.into(), 50.into()]),
                Operation::new("S", vec![]),
            ],
        };
        finish_single_page(doc, pages_id, content, None)
    }

    #[test]
    fn collects_vector_rectangles() {
        let doc = parse_pdf(&sample_pdf_with_rect(), None).expect("parses");
        let rects = &doc.pages[0].rects;
        assert_eq!(rects.len(), 1);
        let b = rects[0].bbox;
        assert!((b.x0 - 100.0).abs() < 0.5, "x0 = {}", b.x0);
        assert!((b.x1 - 300.0).abs() < 0.5, "x1 = {}", b.x1);
        // Expected y values are 792 - 150 and 792 - 100, i.e. the rectangle's
        // vertical extent measured from the top of the 792-high page.
        assert!((b.y0 - 642.0).abs() < 0.5 && (b.y1 - 692.0).abs() < 0.5, "y = {},{}", b.y0, b.y1);
    }

    /// Builds a one-page PDF that draws a 1x1 grayscale image XObject and
    /// contains no text operators.
    fn sample_pdf_scanned() -> Vec<u8> {
        let mut doc = LoDoc::with_version("1.5");
        let pages_id = doc.new_object_id();
        let img = doc.add_object(Stream::new(
            dictionary! {
                "Type" => "XObject", "Subtype" => "Image",
                "Width" => 1, "Height" => 1, "BitsPerComponent" => 8, "ColorSpace" => "DeviceGray",
            },
            vec![0u8],
        ));
        let resources_id = doc.add_object(dictionary! { "XObject" => dictionary! { "Im1" => img } });
        let content = Content {
            operations: vec![
                Operation::new("q", vec![]),
                Operation::new("Do", vec![Object::Name(b"Im1".to_vec())]),
                Operation::new("Q", vec![]),
            ],
        };
        finish_single_page(doc, pages_id, content, Some(resources_id))
    }

    #[test]
    fn scanned_page_warns_needs_ocr() {
        let doc = parse_pdf(&sample_pdf_scanned(), None).expect("parses");
        assert!(doc.pages[0].chars.is_empty(), "scanned page should have no text");
        assert!(
            doc.warnings.iter().any(|w| w.kind == WarningKind::NeedsOcr),
            "expected NeedsOcr; got {:?}",
            doc.warnings
        );
    }
}