pub mod ast;
pub mod detector;
pub mod parser;
pub mod renderer;
use std::collections::HashMap;
use ast::ConversionResult;
use detector::{build_document, DetectorConfig};
/// Reusable converter that carries a [`DetectorConfig`].
///
/// Obtain one through [`Papyrus::builder`] followed by `build()`, then call
/// `extract` with the raw bytes of a PDF.
#[derive(Debug, Clone)]
pub struct Papyrus {
/// Detector settings forwarded to the extraction pipeline on every `extract` call.
config: DetectorConfig,
}
/// Builder for [`Papyrus`].
///
/// Created by `Papyrus::builder()`, which seeds it with
/// `DetectorConfig::default()`. Each setter consumes the builder, overwrites
/// one configuration field and returns the builder; `build()` yields the
/// finished [`Papyrus`].
#[derive(Debug, Clone)]
pub struct PapyrusBuilder {
/// Configuration accumulated so far; moved into the `Papyrus` by `build()`.
config: DetectorConfig,
}
impl PapyrusBuilder {
    /// Overwrites the `heading_size_ratio` field of the configuration.
    ///
    /// Consumes the builder and returns it so that calls can be chained.
    // NOTE(review): the meaning of the ratio is defined by `DetectorConfig`
    // in the `detector` module — see that type for valid values.
    pub fn heading_size_ratio(self, ratio: f32) -> Self {
        self.apply(|builder| builder.config.heading_size_ratio = ratio)
    }

    /// Overwrites the `detect_bold` field of the configuration.
    ///
    /// Consumes the builder and returns it so that calls can be chained.
    pub fn detect_bold(self, enabled: bool) -> Self {
        self.apply(|builder| builder.config.detect_bold = enabled)
    }

    /// Overwrites the `detect_italic` field of the configuration.
    ///
    /// Consumes the builder and returns it so that calls can be chained.
    pub fn detect_italic(self, enabled: bool) -> Self {
        self.apply(|builder| builder.config.detect_italic = enabled)
    }

    /// Finishes the builder, moving the accumulated configuration into a
    /// new [`Papyrus`].
    pub fn build(self) -> Papyrus {
        let Self { config } = self;
        Papyrus { config }
    }

    /// Runs `change` against the builder and hands the builder back.
    ///
    /// Shared plumbing for the public setters above.
    fn apply(mut self, change: impl FnOnce(&mut Self)) -> Self {
        change(&mut self);
        self
    }
}
impl Papyrus {
    /// Starts a [`PapyrusBuilder`] seeded with `DetectorConfig::default()`.
    pub fn builder() -> PapyrusBuilder {
        let config = DetectorConfig::default();
        PapyrusBuilder { config }
    }

    /// Runs the extraction pipeline over `pdf_bytes` using the configuration
    /// stored in this instance.
    ///
    /// Returns a [`ConversionResult`] holding the built document together
    /// with every warning collected along the way.
    pub fn extract(&self, pdf_bytes: &[u8]) -> ConversionResult {
        let Self { config } = self;
        extract_with_config(pdf_bytes, config)
    }
}
/// Converts `pdf_bytes` using `DetectorConfig::default()`.
///
/// Convenience entry point for callers that do not need to customise the
/// detector; it runs the same pipeline as `Papyrus::extract`.
pub fn convert(pdf_bytes: &[u8]) -> ConversionResult {
    let defaults = DetectorConfig::default();
    extract_with_config(pdf_bytes, &defaults)
}
/// Shared extraction pipeline behind [`convert`] and `Papyrus::extract`.
///
/// Steps, in order:
/// 1. Load the PDF via `parser::load_pdf`, keeping its warnings.
/// 2. If loading produced no document, build a document from empty inputs
///    (no segments, no fonts, `page_count` 0) and return it with the
///    warnings gathered so far.
/// 3. Otherwise, for each page in ascending page-number order, resolve the
///    page's fonts and extract its text segments, accumulating warnings.
/// 4. Map segments to fonts, hand everything to `build_document`, and
///    append the detector's warnings.
///
/// Warnings are returned in the order the steps above produce them.
fn extract_with_config(pdf_bytes: &[u8], config: &DetectorConfig) -> ConversionResult {
    use ast::{DocumentMetadata, Warning};

    let (loaded, load_warnings) = parser::load_pdf(pdf_bytes);
    let mut warnings: Vec<Warning> = Vec::new();
    warnings.extend(load_warnings);

    let Some(doc) = loaded else {
        // No document could be loaded: still return a document, built from
        // empty inputs. Warnings returned by the detector for this empty
        // build are intentionally not recorded.
        let empty_metadata = DocumentMetadata {
            title: None,
            author: None,
            page_count: 0,
        };
        let (document, _) = build_document(Vec::new(), &HashMap::new(), config, empty_metadata);
        return ConversionResult { document, warnings };
    };

    let pages = doc.get_pages();
    let page_count = pages.len();
    let (title, author) = parser::extract_doc_info_pub(&doc);
    let metadata = DocumentMetadata {
        title,
        author,
        page_count,
    };

    // Visit pages in ascending page-number order.
    let mut ordered_pages: Vec<u32> = pages.keys().copied().collect();
    ordered_pages.sort();

    // Fonts are keyed by (page number, resource name).
    let mut fonts_by_page: HashMap<(usize, Vec<u8>), parser::FontInfo> = HashMap::new();
    let mut segments: Vec<parser::RawTextSegment> = Vec::new();

    for page in ordered_pages.into_iter().map(|number| number as usize) {
        let (fonts, font_warnings) = parser::resolve_fonts_for_page(&doc, page);
        warnings.extend(font_warnings);
        fonts_by_page.extend(fonts.into_iter().map(|(name, info)| ((page, name), info)));

        // NOTE(review): an empty map is passed as the third argument even
        // though this page's fonts were resolved just above — confirm this
        // is intended against `extract_text_segments_for_page`.
        let (page_segments, segment_warnings) =
            parser::extract_text_segments_for_page(&doc, page, &HashMap::new());
        warnings.extend(segment_warnings);
        segments.extend(page_segments);
    }

    let segment_fonts = build_segment_font_map(&segments, &fonts_by_page, &mut warnings);
    let (document, detector_warnings) = build_document(segments, &segment_fonts, config, metadata);
    warnings.extend(detector_warnings);

    ConversionResult { document, warnings }
}
/// Builds the font lookup handed to the detector: font resource name →
/// [`parser::FontInfo`].
///
/// For every segment, the font is looked up in `page_fonts_map` under
/// `(segment.page_number, segment.font_resource_name)`.
///
/// * On a hit, a clone of the font info is stored in the result under the
///   resource name alone. If the same name is hit again (on the same or
///   another page) the later segment's entry replaces the earlier one.
/// * On a miss, a `Warning::MissingFontMetrics` is pushed onto `warnings`,
///   at most once per resource name; it carries the page of the first
///   segment that missed.
///
/// NOTE(review): the result and the warning de-duplication are keyed by
/// resource name only, while the lookup is keyed per page — confirm that
/// collapsing same-named fonts from different pages is intended.
fn build_segment_font_map(
    segments: &[parser::RawTextSegment],
    page_fonts_map: &HashMap<(usize, Vec<u8>), parser::FontInfo>,
    warnings: &mut Vec<ast::Warning>,
) -> HashMap<Vec<u8>, parser::FontInfo> {
    use std::collections::HashSet;

    let mut fonts_by_name: HashMap<Vec<u8>, parser::FontInfo> = HashMap::new();
    // Resource names that already produced a missing-font warning.
    let mut reported: HashSet<Vec<u8>> = HashSet::new();

    for segment in segments {
        let name = &segment.font_resource_name;
        let lookup = (segment.page_number, name.clone());

        if let Some(info) = page_fonts_map.get(&lookup) {
            fonts_by_name.insert(name.clone(), info.clone());
            continue;
        }

        // `insert` returns true only the first time a name is seen.
        let first_report = reported.insert(name.clone());
        if first_report {
            warnings.push(ast::Warning::MissingFontMetrics {
                font_name: String::from_utf8_lossy(name).into_owned(),
                page: segment.page_number,
            });
        }
    }

    fonts_by_name
}