pub mod backend;
mod docx;
pub mod error;
pub mod ir;
mod layout;
mod output;
mod pdf;
pub use error::{PdfmuseError, Result};
pub use output::{chunk, to_json, to_markdown, to_text, Chunk};
/// Input document formats this crate can parse.
///
/// Pass a variant to [`parse`] or [`parse_with_password`] to force a
/// specific parser, or pass `None` there to have the format sniffed from
/// the leading bytes of the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Handled by the PDF parser.
    Pdf,
    /// Handled by the DOCX parser.
    Docx,
}
/// Parses `data` into an [`ir::Document`] without supplying a password.
///
/// This is a convenience wrapper around [`parse_with_password`] that always
/// passes `None` as the password. When `fmt` is `None` the format is
/// detected from the bytes themselves.
///
/// # Errors
///
/// Returns whatever [`parse_with_password`] returns for the same input.
pub fn parse(data: &[u8], fmt: Option<Format>) -> Result<ir::Document> {
    let no_password: Option<&str> = None;
    parse_with_password(data, fmt, no_password)
}
/// Parses `data` into an [`ir::Document`], optionally using `password`.
///
/// An explicit `fmt` always wins; format detection on the raw bytes is only
/// attempted when `fmt` is `None`.
///
/// * PDF input is handed to the PDF parser together with `password`, and
///   every resulting page is then run through the layout pass.
/// * DOCX input is handed to the DOCX parser; `password` is not forwarded
///   on this path and no layout pass is run here.
///
/// # Errors
///
/// Returns [`PdfmuseError::InvalidFormat`] when no format was given and none
/// could be detected. Errors from the selected parser are propagated.
pub fn parse_with_password(
    data: &[u8],
    fmt: Option<Format>,
    password: Option<&str>,
) -> Result<ir::Document> {
    // Settle on a concrete format first so the dispatch below is total.
    let format = match fmt.or_else(|| detect_format(data)) {
        Some(resolved) => resolved,
        None => return Err(PdfmuseError::InvalidFormat),
    };

    match format {
        Format::Docx => docx::parse(data),
        Format::Pdf => {
            let mut doc = pdf::parse_pdf(data, password)?;
            layout_pages(&mut doc);
            Ok(doc)
        }
    }
}
/// Applies `layout::layout_page` to every page of `doc`, in place.
///
/// With the `rayon` feature enabled the pages are processed through rayon's
/// parallel iterator; otherwise they are visited one after another.
fn layout_pages(doc: &mut ir::Document) {
    let pages = &mut doc.pages;

    #[cfg(feature = "rayon")]
    {
        use rayon::prelude::*;
        pages.par_iter_mut().for_each(layout::layout_page);
    }

    #[cfg(not(feature = "rayon"))]
    for page in pages.iter_mut() {
        layout::layout_page(page);
    }
}
/// Guesses the document format from the leading bytes of `data`.
///
/// * Input that starts with the ZIP local-file-header signature
///   (`PK\x03\x04`) is reported as [`Format::Docx`]. Only the signature is
///   checked here; the archive contents are not inspected.
/// * Otherwise, input containing `%PDF-` entirely within its first 1024
///   bytes is reported as [`Format::Pdf`]. The marker does not have to sit
///   at offset 0.
///
/// Returns `None` when neither signature is found.
fn detect_format(data: &[u8]) -> Option<Format> {
    const ZIP_MAGIC: &[u8] = b"PK\x03\x04";
    const PDF_MAGIC: &[u8] = b"%PDF-";
    const PDF_SCAN_LIMIT: usize = 1024;

    if data.starts_with(ZIP_MAGIC) {
        return Some(Format::Docx);
    }

    // `get` yields `None` when the input is shorter than the scan limit, in
    // which case the whole input is scanned instead.
    let head = data.get(..PDF_SCAN_LIMIT).unwrap_or(data);
    let has_pdf_marker = head
        .windows(PDF_MAGIC.len())
        .any(|window| window == PDF_MAGIC);

    if has_pdf_marker {
        Some(Format::Pdf)
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Format sniffing recognises both magic numbers and rejects other input.
    #[test]
    fn detects_pdf_and_docx_magic() {
        let pdf = detect_format(b"%PDF-1.7\ntrailer");
        let docx = detect_format(b"PK\x03\x04rest");
        let unknown = detect_format(b"not a document");

        assert_eq!(pdf, Some(Format::Pdf));
        assert_eq!(docx, Some(Format::Docx));
        assert_eq!(unknown, None);
    }

    /// A bare ZIP signature must reach the DOCX parser, which is expected to
    /// reject it as malformed rather than the crate reporting an unknown
    /// format.
    #[test]
    fn docx_magic_routes_to_docx_parser() {
        let err = parse(b"PK\x03\x04", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::Malformed(_)));
    }

    /// Input carrying neither signature is reported as an invalid format.
    #[test]
    fn unknown_bytes_are_invalid_format() {
        let err = parse(b"garbage", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::InvalidFormat));
    }
}