use crate::document::PdfDocument;
use crate::error::{Error, Result};
use crate::geometry::Rect;
use crate::layout::TextBlock;
use serde::{Deserialize, Serialize};
/// Structured view of extracted content, (de)serializable through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuredDocument {
/// The document's elements, in sequence.
pub elements: Vec<DocumentElement>,
/// Two-component size. `StructuredExtractor::extract_page` fills it with the
/// largest `x + width` and largest `y + height` over the page's text spans,
/// so it is an estimate derived from text extents, not a declared page size.
pub page_size: (f32, f32),
/// Per-kind counts describing `elements`.
pub metadata: DocumentMetadata,
}
/// One element of a [`StructuredDocument`].
///
/// Serialized as an internally tagged enum: the variant is stored in a
/// `"type"` field using the lowercase names given by the `rename` attributes.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum DocumentElement {
/// A heading.
#[serde(rename = "header")]
Header {
/// Heading level. NOTE(review): presumably 1 is the top level; nothing in
/// this file establishes the numbering — confirm.
level: u8,
/// Heading text.
text: String,
/// Font attributes of the heading.
style: TextStyle,
/// Position and size of the heading.
bbox: BoundingBox,
},
/// A run of body text.
#[serde(rename = "paragraph")]
Paragraph {
/// Paragraph text.
text: String,
/// Font attributes of the paragraph.
style: TextStyle,
/// Position and size of the paragraph.
bbox: BoundingBox,
/// Horizontal alignment of the text.
alignment: TextAlignment,
},
/// A list of items.
#[serde(rename = "list")]
List {
/// The list's items, in sequence.
items: Vec<ListItem>,
/// Whether the list is ordered (as opposed to unordered).
ordered: bool,
/// Position and size of the whole list.
bbox: BoundingBox,
},
/// A table of text cells.
#[serde(rename = "table")]
Table {
/// Row count. NOTE(review): presumably equals `cells.len()`; not enforced
/// anywhere in this file.
rows: usize,
/// Column count. NOTE(review): presumably the length of each inner
/// vector of `cells`; not enforced anywhere in this file.
cols: usize,
/// Cell text; `StructuredDocument::to_plain_text` treats each inner
/// vector as one row.
cells: Vec<Vec<String>>,
/// Position and size of the whole table.
bbox: BoundingBox,
},
}
/// A single entry of a [`DocumentElement::List`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListItem {
/// Text of the item.
pub text: String,
/// Font attributes of the item.
pub style: TextStyle,
/// Optional child element attached to this item; boxed because
/// `DocumentElement` contains `ListItem` and the type is therefore recursive.
pub nested: Option<Box<DocumentElement>>,
/// Position and size of the item.
pub bbox: BoundingBox,
}
/// Font attributes attached to a text element.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextStyle {
/// Font name; the extractor copies the block's dominant font name verbatim.
pub font_family: String,
/// Font size. NOTE(review): presumably in points — confirm against the
/// span extraction code.
pub font_size: f32,
/// Whether the text is bold.
pub bold: bool,
/// Whether the text is italic.
pub italic: bool,
/// Color as (red, green, blue). NOTE(review): presumably each channel is in
/// 0.0..=1.0 — confirm against the character color type.
pub color: (f32, f32, f32),
}
impl Default for TextStyle {
fn default() -> Self {
Self {
font_family: "Unknown".to_string(),
font_size: 12.0,
bold: false,
italic: false,
color: (0.0, 0.0, 0.0), }
}
}
/// Bounding box as `(x, y, width, height)`, the order in which
/// `StructuredExtractor::bbox_from_rect` unpacks a `Rect`.
pub type BoundingBox = (f32, f32, f32, f32);
/// Horizontal alignment of a paragraph.
///
/// Serialized with lowercase variant names (`"left"`, `"center"`, `"right"`,
/// `"justified"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum TextAlignment {
/// Aligned to the left edge.
Left,
/// Centered.
Center,
/// Aligned to the right edge.
Right,
/// Justified.
Justified,
}
/// Element counts for a [`StructuredDocument`].
///
/// As computed by the extractor, only top-level elements are counted; elements
/// nested inside list items are not visited.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
/// Total number of top-level elements.
pub element_count: usize,
/// Number of `Header` elements.
pub header_count: usize,
/// Number of `Paragraph` elements.
pub paragraph_count: usize,
/// Number of `List` elements.
pub list_count: usize,
/// Number of `Table` elements.
pub table_count: usize,
}
/// Converts the text spans of a PDF page into a [`StructuredDocument`].
// `dead_code` is allowed because `config` is stored but not read by any
// method of this type in this file.
#[allow(dead_code)]
pub struct StructuredExtractor {
/// Extraction settings supplied at construction time.
config: ExtractorConfig,
}
/// Settings for [`StructuredExtractor`].
///
/// None of these fields is consulted by the extractor code in this file; the
/// extractor currently emits every span as a left-aligned paragraph.
#[derive(Debug, Clone)]
pub struct ExtractorConfig {
/// Minimum font size for header detection (default 14.0).
/// NOTE(review): unit is presumably points — confirm.
#[allow(dead_code)]
pub min_header_size: f32,
/// Maximum number of header levels (default 6).
#[allow(dead_code)]
pub max_header_levels: u8,
/// Gap threshold for paragraph separation (default 1.5).
/// NOTE(review): presumably a multiple of line height — confirm.
#[allow(dead_code)]
pub paragraph_gap_threshold: f32,
/// Whether list detection is requested (default `false`).
#[allow(dead_code)]
pub detect_lists: bool,
/// Whether table detection is requested (default `false`).
#[allow(dead_code)]
pub detect_tables: bool,
}
impl Default for ExtractorConfig {
fn default() -> Self {
Self {
min_header_size: 14.0,
max_header_levels: 6,
paragraph_gap_threshold: 1.5,
detect_lists: false,
detect_tables: false,
}
}
}
impl StructuredExtractor {
    /// Creates an extractor that uses [`ExtractorConfig::default`].
    pub fn new() -> Self {
        Self::with_config(ExtractorConfig::default())
    }

    /// Creates an extractor that uses the given configuration.
    pub fn with_config(config: ExtractorConfig) -> Self {
        Self { config }
    }

    /// Extracts one page as a flat sequence of paragraphs.
    ///
    /// Each span returned by `document.extract_spans` becomes exactly one
    /// left-aligned [`DocumentElement::Paragraph`]; no header, list, or table
    /// detection is performed. A page without spans yields an empty document
    /// with a `(0.0, 0.0)` page size and all-zero metadata.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `document.extract_spans`.
    pub fn extract_page(
        &mut self,
        document: &mut PdfDocument,
        page_num: u32,
    ) -> Result<StructuredDocument> {
        let spans = document.extract_spans(page_num as usize)?;

        // No special case is needed for an empty page: every step below maps
        // an empty slice to an empty/zero result.
        let blocks = self.spans_to_blocks(&spans);
        let elements = self.blocks_to_simple_paragraphs(&blocks);
        let page_size = self.calculate_page_size_from_spans(&spans);
        let metadata = self.calculate_metadata(&elements);

        Ok(StructuredDocument {
            elements,
            page_size,
            metadata,
        })
    }

    /// Wraps each span in its own `TextBlock`.
    ///
    /// The resulting blocks carry no per-character data (`chars` is empty).
    fn spans_to_blocks(&self, spans: &[crate::layout::TextSpan]) -> Vec<TextBlock> {
        spans
            .iter()
            .map(|span| TextBlock {
                chars: Vec::new(),
                bbox: span.bbox,
                text: span.text.clone(),
                avg_font_size: span.font_size,
                dominant_font: span.font_name.clone(),
                is_bold: span.font_weight == crate::layout::FontWeight::Bold,
                is_italic: span.is_italic,
                mcid: span.mcid,
            })
            .collect()
    }

    /// Estimates a size from text extents.
    ///
    /// Returns the largest `x + width` and the largest `y + height` over all
    /// spans. Both components start at 0.0, so the result is never negative
    /// and is `(0.0, 0.0)` for an empty slice.
    fn calculate_page_size_from_spans(&self, spans: &[crate::layout::TextSpan]) -> (f32, f32) {
        spans.iter().fold((0.0f32, 0.0f32), |(max_x, max_y), span| {
            (
                max_x.max(span.bbox.x + span.bbox.width),
                max_y.max(span.bbox.y + span.bbox.height),
            )
        })
    }

    /// Turns every block into a left-aligned paragraph, preserving order.
    fn blocks_to_simple_paragraphs(&self, blocks: &[TextBlock]) -> Vec<DocumentElement> {
        blocks
            .iter()
            .map(|block| DocumentElement::Paragraph {
                text: block.text.clone(),
                style: Self::block_to_text_style(block),
                bbox: Self::bbox_from_rect(block.bbox),
                alignment: TextAlignment::Left,
            })
            .collect()
    }

    /// Derives a [`TextStyle`] from a block.
    ///
    /// Bold and italic are taken from the block's own flags, with the font
    /// name as a fallback ("Bold" for bold; "Italic" or "Oblique" for italic).
    /// The color comes from the block's first character and falls back to
    /// `(0.0, 0.0, 0.0)` when the block has no characters, which is always the
    /// case for blocks built by `spans_to_blocks`.
    fn block_to_text_style(block: &TextBlock) -> TextStyle {
        let font = block.dominant_font.as_str();
        let bold = block.is_bold || font.contains("Bold");
        // Honor the explicit italic flag as well as the font name, mirroring
        // the bold logic; otherwise an italic span whose font name lacks an
        // "Italic"/"Oblique" marker would be reported as upright.
        let italic = block.is_italic || font.contains("Italic") || font.contains("Oblique");
        let color = block
            .chars
            .first()
            .map(|c| (c.color.r, c.color.g, c.color.b))
            .unwrap_or((0.0, 0.0, 0.0));

        TextStyle {
            font_family: block.dominant_font.clone(),
            font_size: block.avg_font_size,
            bold,
            italic,
            color,
        }
    }

    /// Unpacks a `Rect` into an `(x, y, width, height)` tuple.
    fn bbox_from_rect(rect: Rect) -> BoundingBox {
        (rect.x, rect.y, rect.width, rect.height)
    }

    /// Counts the top-level elements by kind.
    ///
    /// Elements nested inside list items are not visited.
    fn calculate_metadata(&self, elements: &[DocumentElement]) -> DocumentMetadata {
        let mut header_count = 0;
        let mut paragraph_count = 0;
        let mut list_count = 0;
        let mut table_count = 0;

        for element in elements {
            // Exhaustive on purpose: adding a variant must force an update here.
            let counter = match element {
                DocumentElement::Header { .. } => &mut header_count,
                DocumentElement::Paragraph { .. } => &mut paragraph_count,
                DocumentElement::List { .. } => &mut list_count,
                DocumentElement::Table { .. } => &mut table_count,
            };
            *counter += 1;
        }

        DocumentMetadata {
            element_count: elements.len(),
            header_count,
            paragraph_count,
            list_count,
            table_count,
        }
    }
}
impl Default for StructuredExtractor {
fn default() -> Self {
Self::new()
}
}
impl StructuredDocument {
    /// Renders the document as plain text.
    ///
    /// Every header, paragraph, list item, and table row starts on its own
    /// line; a header is additionally followed by a newline, so whatever comes
    /// after it is separated by a blank line. Table cells within a row are
    /// joined with tab characters. Only the `text` of each list item is
    /// emitted; its `nested` element is not rendered.
    ///
    /// The output never begins with a newline, regardless of which kind of
    /// element comes first.
    pub fn to_plain_text(&self) -> String {
        /// Appends `line` to `buf`, first adding a newline separator unless
        /// `buf` is still empty.
        fn push_line(buf: &mut String, line: &str) {
            if !buf.is_empty() {
                buf.push('\n');
            }
            buf.push_str(line);
        }

        let mut text = String::new();
        for element in &self.elements {
            match element {
                DocumentElement::Header { text: t, .. } => {
                    push_line(&mut text, t);
                    text.push('\n');
                },
                DocumentElement::Paragraph { text: t, .. } => {
                    push_line(&mut text, t);
                },
                DocumentElement::List { items, .. } => {
                    for item in items {
                        push_line(&mut text, &item.text);
                    }
                },
                DocumentElement::Table { cells, .. } => {
                    for row in cells {
                        push_line(&mut text, &row.join("\t"));
                    }
                },
            }
        }
        text
    }

    /// Serializes the document as pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Returns `Error::ParseError` with `offset` 0 and a `reason` carrying the
    /// serializer's message if `serde_json` fails.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self).map_err(|e| Error::ParseError {
            offset: 0,
            reason: format!("Failed to serialize to JSON: {}", e),
        })
    }
}