pub mod convert;
pub mod detect;
pub mod error;
pub mod model;
pub mod parser;
pub mod render;
#[cfg(feature = "ffi")]
pub mod ffi;
pub use convert::{
ConvertOptions, ConvertResult, ConverterRegistry, DocumentConverter, OutputFormat,
};
pub use detect::{detect_format_from_bytes, detect_format_from_path, is_pdf, PdfFormat};
pub use error::{Error, Result};
pub use model::{
Alignment, Block, Document, ExtractionQuality, FieldType, FieldValue, FormField, InlineContent,
ListInfo, Metadata, Outline, Page, Paragraph, ParagraphStyle, QualityAccumulator, Resource,
ResourceType, Table, TableCell, TableRow, TextRun, TextStyle,
};
pub use parser::{PageStreamOptions, ParseEvent, ParseOptions, PdfParser};
pub use render::{
CleanupOptions, CleanupPreset, HeadingConfig, JsonFormat, PageSelection, RenderOptions,
TableFallback,
};
use std::io::Read;
use std::path::Path;
/// Opens the file at `path` with [`PdfParser::open`] and parses it into a [`Document`].
///
/// # Errors
///
/// Propagates any error returned while opening the file or while parsing it.
pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<Document> {
    PdfParser::open(path)?.parse()
}
/// Opens the file at `path` with the supplied [`ParseOptions`] and parses it into a
/// [`Document`].
///
/// # Errors
///
/// Propagates any error returned by [`PdfParser::open_with_options`] or by the parse step.
pub fn parse_file_with_options<P: AsRef<Path>>(
    path: P,
    options: ParseOptions,
) -> Result<Document> {
    PdfParser::open_with_options(path, options)?.parse()
}
/// Builds a parser over the in-memory buffer `data` via [`PdfParser::from_bytes`] and parses
/// it into a [`Document`].
///
/// # Errors
///
/// Propagates any error returned while constructing the parser or while parsing. The unit
/// tests in this file expect an error for empty input, for a bare `%PDF` prefix, and for
/// bytes that do not start with a PDF header.
pub fn parse_bytes(data: &[u8]) -> Result<Document> {
    PdfParser::from_bytes(data)?.parse()
}
/// Builds a parser over the in-memory buffer `data` with the supplied [`ParseOptions`] and
/// parses it into a [`Document`].
///
/// # Errors
///
/// Propagates any error returned by [`PdfParser::from_bytes_with_options`] or by the parse
/// step.
pub fn parse_bytes_with_options(data: &[u8], options: ParseOptions) -> Result<Document> {
    PdfParser::from_bytes_with_options(data, options)?.parse()
}
/// Builds a parser from any [`Read`] source via [`PdfParser::from_reader`] and parses it into
/// a [`Document`].
///
/// # Errors
///
/// Propagates any error returned while constructing the parser from `reader` or while
/// parsing.
pub fn parse_reader<R: Read>(reader: R) -> Result<Document> {
    PdfParser::from_reader(reader)?.parse()
}
/// Builds a parser from any [`Read`] source with the supplied [`ParseOptions`] and parses it
/// into a [`Document`].
///
/// # Errors
///
/// Propagates any error returned by [`PdfParser::from_reader_with_options`] or by the parse
/// step.
pub fn parse_reader_with_options<R: Read>(
    reader: R,
    options: ParseOptions,
) -> Result<Document> {
    PdfParser::from_reader_with_options(reader, options)?.parse()
}
/// Parses the file at `path`, supplying `password` through [`ParseOptions::with_password`].
///
/// This is shorthand for calling [`parse_file_with_options`] with a fresh
/// [`ParseOptions::new`] value that carries only the password.
///
/// # Errors
///
/// Propagates any error returned by [`parse_file_with_options`].
pub fn parse_file_with_password<P: AsRef<Path>>(path: P, password: &str) -> Result<Document> {
    parse_file_with_options(path, ParseOptions::new().with_password(password))
}
/// Parses the file at `path` and returns the result of [`Document::plain_text`].
///
/// # Errors
///
/// Propagates any error returned by [`parse_file`].
pub fn extract_text<P: AsRef<Path>>(path: P) -> Result<String> {
    parse_file(path).map(|document| document.plain_text())
}
/// Parses the file at `path` and renders it to Markdown with `RenderOptions::default()`.
///
/// # Errors
///
/// Propagates any error returned by [`parse_file`] or by [`render::to_markdown`].
pub fn to_markdown<P: AsRef<Path>>(path: P) -> Result<String> {
    // Parse first, then build the default options, matching the order callers already get.
    let document = parse_file(path)?;
    render::to_markdown(&document, &RenderOptions::default())
}
/// Parses the file at `path` and renders it to Markdown using the caller's `options`.
///
/// # Errors
///
/// Propagates any error returned by [`parse_file`] or by [`render::to_markdown`].
pub fn to_markdown_with_options<P: AsRef<Path>>(
    path: P,
    options: &RenderOptions,
) -> Result<String> {
    parse_file(path).and_then(|document| render::to_markdown(&document, options))
}
/// Parses the file at `path` and renders it through [`render::to_text`] using `options`.
///
/// # Errors
///
/// Propagates any error returned by [`parse_file`] or by [`render::to_text`].
pub fn to_text<P: AsRef<Path>>(path: P, options: &RenderOptions) -> Result<String> {
    parse_file(path).and_then(|document| render::to_text(&document, options))
}
/// Parses the file at `path` and serializes it through [`render::to_json`] in the requested
/// [`JsonFormat`].
///
/// # Errors
///
/// Propagates any error returned by [`parse_file`] or by [`render::to_json`].
pub fn to_json<P: AsRef<Path>>(path: P, format: JsonFormat) -> Result<String> {
    parse_file(path).and_then(|document| render::to_json(&document, format))
}
/// Builder that accumulates parse and render settings before a document is parsed.
///
/// Each configuration method consumes the builder and returns the updated value, so calls
/// can be chained. `parse` or `parse_bytes` then consumes the builder and yields an
/// [`UnpdfResult`].
pub struct Unpdf {
/// Options handed to [`PdfParser`] when `parse` or `parse_bytes` is called.
parse_options: ParseOptions,
/// Options moved into the resulting [`UnpdfResult`] for later rendering.
render_options: RenderOptions,
}
impl Unpdf {
    /// Creates a builder with `ParseOptions::default()` and `RenderOptions::default()`.
    pub fn new() -> Self {
        let parse_options = ParseOptions::default();
        let render_options = RenderOptions::default();
        Self {
            parse_options,
            render_options,
        }
    }

    /// Applies [`ParseOptions::lenient`] to the stored parse options.
    pub fn lenient(self) -> Self {
        Self {
            parse_options: self.parse_options.lenient(),
            ..self
        }
    }

    /// Applies [`ParseOptions::text_only`] to the stored parse options.
    pub fn text_only(self) -> Self {
        Self {
            parse_options: self.parse_options.text_only(),
            ..self
        }
    }

    /// Applies [`ParseOptions::sequential`] to the stored parse options.
    pub fn sequential(self) -> Self {
        Self {
            parse_options: self.parse_options.sequential(),
            ..self
        }
    }

    /// Forwards `extract` to [`ParseOptions::with_resources`].
    ///
    /// Presumably this toggles extraction of embedded resources such as images; confirm
    /// against the `ParseOptions` documentation.
    pub fn with_images(self, extract: bool) -> Self {
        Self {
            parse_options: self.parse_options.with_resources(extract),
            ..self
        }
    }

    /// Forwards `dir` to [`RenderOptions::with_image_dir`].
    pub fn with_image_dir(self, dir: impl Into<std::path::PathBuf>) -> Self {
        Self {
            render_options: self.render_options.with_image_dir(dir),
            ..self
        }
    }

    /// Enables frontmatter by calling [`RenderOptions::with_frontmatter`] with `true`.
    pub fn with_frontmatter(self) -> Self {
        Self {
            render_options: self.render_options.with_frontmatter(true),
            ..self
        }
    }

    /// Forwards `fallback` to [`RenderOptions::with_table_fallback`].
    pub fn with_table_fallback(self, fallback: TableFallback) -> Self {
        Self {
            render_options: self.render_options.with_table_fallback(fallback),
            ..self
        }
    }

    /// Forwards `preset` to [`RenderOptions::with_cleanup_preset`].
    pub fn with_cleanup(self, preset: CleanupPreset) -> Self {
        Self {
            render_options: self.render_options.with_cleanup_preset(preset),
            ..self
        }
    }

    /// Forwards `password` to [`ParseOptions::with_password`].
    pub fn with_password(self, password: impl Into<String>) -> Self {
        Self {
            parse_options: self.parse_options.with_password(password),
            ..self
        }
    }

    /// Applies the same page selection to both the parse options and the render options.
    pub fn with_pages(self, pages: PageSelection) -> Self {
        // The parser side gets a clone so the original value can move into the renderer side.
        let parser_pages = pages.clone();
        Self {
            parse_options: self.parse_options.with_pages(parser_pages),
            render_options: self.render_options.with_pages(pages),
        }
    }

    /// Consumes the builder, parses the file at `path` with the accumulated parse options,
    /// and returns the document together with the accumulated render options.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`PdfParser::open_with_options`] or by the parse
    /// step.
    pub fn parse<P: AsRef<Path>>(self, path: P) -> Result<UnpdfResult> {
        let Self {
            parse_options,
            render_options,
        } = self;
        let document = PdfParser::open_with_options(path, parse_options)?.parse()?;
        Ok(UnpdfResult {
            document,
            render_options,
        })
    }

    /// Consumes the builder, parses the in-memory buffer `data` with the accumulated parse
    /// options, and returns the document together with the accumulated render options.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`PdfParser::from_bytes_with_options`] or by the
    /// parse step.
    pub fn parse_bytes(self, data: &[u8]) -> Result<UnpdfResult> {
        let Self {
            parse_options,
            render_options,
        } = self;
        let document = PdfParser::from_bytes_with_options(data, parse_options)?.parse()?;
        Ok(UnpdfResult {
            document,
            render_options,
        })
    }
}
impl Default for Unpdf {
fn default() -> Self {
Self::new()
}
}
/// Parsed document bundled with the render options chosen on the [`Unpdf`] builder.
pub struct UnpdfResult {
/// The parsed document; also reachable through the `document()` accessor.
pub document: Document,
/// Render options used by `to_markdown` and `to_text`.
render_options: RenderOptions,
}
impl UnpdfResult {
    /// Renders the document to Markdown using the stored render options.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`render::to_markdown`].
    pub fn to_markdown(&self) -> Result<String> {
        let Self {
            document,
            render_options,
        } = self;
        render::to_markdown(document, render_options)
    }

    /// Renders the document through [`render::to_text`] using the stored render options.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`render::to_text`].
    pub fn to_text(&self) -> Result<String> {
        let Self {
            document,
            render_options,
        } = self;
        render::to_text(document, render_options)
    }

    /// Serializes the document through [`render::to_json`] in the requested format.
    ///
    /// The stored render options are not consulted here; only `format` is passed along.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`render::to_json`].
    pub fn to_json(&self, format: JsonFormat) -> Result<String> {
        render::to_json(self.document(), format)
    }

    /// Returns the result of [`Document::plain_text`] for the parsed document.
    pub fn plain_text(&self) -> String {
        self.document().plain_text()
    }

    /// Borrows the parsed document.
    pub fn document(&self) -> &Document {
        &self.document
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lenient mode and frontmatter requested on the builder end up in the stored options.
    #[test]
    fn test_unpdf_builder() {
        let configured = Unpdf::new()
            .lenient()
            .with_frontmatter()
            .with_cleanup(CleanupPreset::Standard);

        assert!(matches!(
            configured.parse_options.error_mode,
            parser::ErrorMode::Lenient
        ));
        assert!(configured.render_options.include_frontmatter);
    }

    /// An empty buffer is rejected by `parse_bytes`.
    #[test]
    fn test_parse_bytes_empty_data() {
        let empty: [u8; 0] = [];
        let outcome = parse_bytes(&empty);
        assert!(outcome.is_err());
    }

    /// A bare `%PDF` prefix is rejected by `parse_bytes`.
    #[test]
    fn test_parse_bytes_too_short() {
        let outcome = parse_bytes(b"%PDF");
        assert!(outcome.is_err());
    }

    /// Bytes that do not start with a PDF header are rejected by `parse_bytes`.
    #[test]
    fn test_parse_bytes_unknown_magic() {
        let garbage = [0xFF, 0xFE, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07];
        let outcome = parse_bytes(&garbage);
        assert!(outcome.is_err());
    }

    /// Format detection on an empty buffer reports `Error::UnknownFormat`.
    #[test]
    fn test_detect_format_empty_data() {
        let empty: [u8; 0] = [];
        let outcome = detect_format_from_bytes(&empty);
        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(Error::UnknownFormat)));
    }

    /// Format detection on `%PDF-` with no version reports `Error::UnknownFormat`.
    #[test]
    fn test_detect_format_too_short() {
        let outcome = detect_format_from_bytes(b"%PDF-");
        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(Error::UnknownFormat)));
    }

    /// Format detection on HTML input reports `Error::UnknownFormat`.
    #[test]
    fn test_detect_format_unknown_magic() {
        let outcome = detect_format_from_bytes(b"<!DOCTYPE html><html></html>");
        assert!(outcome.is_err());
        assert!(matches!(outcome, Err(Error::UnknownFormat)));
    }

    /// A `%PDF-1.7` header is detected with version `1.7`.
    #[test]
    fn test_detect_valid_pdf_17() {
        let detected = detect_format_from_bytes(b"%PDF-1.7\n%test").unwrap();
        assert_eq!(detected.version, "1.7");
    }

    /// A `%PDF-2.0` header is detected with version `2.0`.
    #[test]
    fn test_detect_valid_pdf_20() {
        let detected = detect_format_from_bytes(b"%PDF-2.0\n%test").unwrap();
        assert_eq!(detected.version, "2.0");
    }

    /// `is_pdf_bytes` accepts a PDF header and rejects plain text and empty input.
    #[test]
    fn test_is_pdf_bytes() {
        assert!(detect::is_pdf_bytes(b"%PDF-1.4\ntest"));
        assert!(!detect::is_pdf_bytes(b"Not a PDF file"));
        assert!(!detect::is_pdf_bytes(b""));
    }

    /// The default builder does not request frontmatter.
    #[test]
    fn test_unpdf_builder_default() {
        let fresh = Unpdf::default();
        assert!(!fresh.render_options.include_frontmatter);
    }

    /// `text_only` switches the extract mode to `TextOnly`.
    #[test]
    fn test_unpdf_builder_text_only() {
        let configured = Unpdf::new().text_only();
        assert!(matches!(
            configured.parse_options.extract_mode,
            parser::ExtractMode::TextOnly
        ));
    }

    /// `sequential` turns the `parallel` flag off.
    #[test]
    fn test_unpdf_builder_sequential() {
        let configured = Unpdf::new().sequential();
        assert!(!configured.parse_options.parallel);
    }

    /// `with_password` stores the password in the parse options.
    #[test]
    fn test_unpdf_builder_with_password() {
        let configured = Unpdf::new().with_password("secret");
        assert_eq!(
            configured.parse_options.password,
            Some(String::from("secret"))
        );
    }

    /// `with_pages` stores a range selection in the render options.
    #[test]
    fn test_unpdf_builder_with_pages() {
        let configured = Unpdf::new().with_pages(PageSelection::Range(1..=5));
        assert!(matches!(
            configured.render_options.page_selection,
            PageSelection::Range(_)
        ));
    }

    /// `with_table_fallback` stores the chosen fallback in the render options.
    #[test]
    fn test_unpdf_builder_with_table_fallback() {
        let configured = Unpdf::new().with_table_fallback(TableFallback::Html);
        assert!(matches!(
            configured.render_options.table_fallback,
            TableFallback::Html
        ));
    }

    /// Several chained builder calls all take effect.
    #[test]
    fn test_unpdf_builder_chained() {
        let configured = Unpdf::new()
            .lenient()
            .with_frontmatter()
            .with_cleanup(CleanupPreset::Aggressive)
            .with_table_fallback(TableFallback::Ascii)
            .sequential();

        assert!(matches!(
            configured.parse_options.error_mode,
            parser::ErrorMode::Lenient
        ));
        assert!(configured.render_options.include_frontmatter);
        assert!(!configured.parse_options.parallel);
    }

    /// Parsing non-PDF bytes through the builder yields an error.
    #[test]
    fn test_unpdf_builder_parse_invalid_bytes() {
        let outcome = Unpdf::new().parse_bytes(b"not a pdf");
        assert!(outcome.is_err());
    }

    /// Default render options do not include frontmatter.
    #[test]
    fn test_render_options_defaults() {
        let defaults = RenderOptions::default();
        assert!(!defaults.include_frontmatter);
    }

    /// `with_image_dir` stores the directory as a `PathBuf`.
    #[test]
    fn test_render_options_with_image_dir() {
        use std::path::PathBuf;

        let configured = RenderOptions::new().with_image_dir("./images");
        assert_eq!(configured.image_dir, Some(PathBuf::from("./images")));
    }

    /// Every cleanup preset can be applied to fresh render options.
    #[test]
    fn test_cleanup_preset_variants() {
        let _minimal = RenderOptions::new().with_cleanup_preset(CleanupPreset::Minimal);
        let _standard = RenderOptions::new().with_cleanup_preset(CleanupPreset::Standard);
        let _aggressive = RenderOptions::new().with_cleanup_preset(CleanupPreset::Aggressive);
    }

    /// Both JSON format variants are nameable through the crate-root re-export.
    #[test]
    fn test_json_format_variants() {
        let _pretty = JsonFormat::Pretty;
        let _compact = JsonFormat::Compact;
    }

    /// `PageSelection::All` matches itself.
    #[test]
    fn test_page_selection_all() {
        let everything = PageSelection::All;
        assert!(matches!(everything, PageSelection::All));
    }
}