use serde::Serialize;
use std::collections::HashMap;
/// Input handed to the PDF pipeline: either a path string or an in-memory byte buffer.
#[doc(hidden)]
#[derive(Debug, Clone)]
pub enum PdfInput {
/// A path given as a string.
// NOTE(review): presumably a filesystem path; how it is resolved is not visible here — confirm.
Path(String),
/// Raw bytes held in memory.
Bytes(Vec<u8>),
}
/// A piece of text together with its geometry and optional font/style metadata.
///
/// Serialization (serde): `text`, the geometry fields, `font_name` and `font_size`
/// are always emitted; for the latter two, `None` is passed to the serializer rather
/// than skipped (e.g. `null` in JSON). Every other `Option` field is omitted when
/// `None`, and the boolean flags are omitted when `false`.
///
/// `Default` produces an empty string, zeroed geometry, `None` for every optional
/// field and `false` for every flag.
#[derive(Debug, Clone, Default, Serialize)]
pub struct TextItem {
/// The text content.
pub text: String,
// NOTE(review): the coordinate origin and units of x/y/width/height are not stated in this file — confirm.
pub x: f32,
pub y: f32,
pub width: f32,
pub height: f32,
// NOTE(review): rotation units (degrees vs. radians) are not stated in this file — confirm.
pub rotation: f32,
/// Font name, if known. Always serialized, even when `None`.
pub font_name: Option<String>,
/// Font size, if known. Always serialized, even when `None`.
pub font_size: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub font_height: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub font_ascent: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub font_descent: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub font_weight: Option<i32>,
// NOTE(review): presumably a bit set of font descriptor flags; the bit meanings are not visible here.
#[serde(skip_serializing_if = "Option::is_none")]
pub font_flags: Option<i32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub text_width: Option<f32>,
/// Only serialized when `true` (`std::ops::Not::not` skips the `false` case).
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub font_is_buggy: bool,
/// Only serialized when `true`.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub has_unicode_map_error: bool,
// NOTE(review): presumably the marked-content identifier that `StructNode::mcids` refers to — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub mcid: Option<i32>,
// NOTE(review): the string encoding of colors (hex, name, ...) is not visible here.
#[serde(skip_serializing_if = "Option::is_none")]
pub fill_color: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub stroke_color: Option<String>,
// NOTE(review): the source and range of this score are not visible here — confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub confidence: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub link: Option<String>,
/// Only serialized when `true`.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub strike: bool,
}
/// One page's data: its number, dimensions, text items and auxiliary
/// graphics/structure/image data.
///
/// Only `page_number`, `page_width`, `page_height` and `text_items` are serialized;
/// the remaining fields are marked `#[serde(skip)]`.
#[doc(hidden)]
#[derive(Debug, Serialize)]
pub struct Page {
// NOTE(review): whether page numbers start at 0 or 1 is not established by this file — confirm.
pub page_number: usize,
pub page_width: f32,
pub page_height: f32,
/// Text items belonging to this page.
pub text_items: Vec<TextItem>,
/// Graphic primitives for this page; never serialized.
#[serde(skip)]
pub graphics: Vec<GraphicPrimitive>,
/// Structure nodes for this page; never serialized.
#[serde(skip)]
pub struct_nodes: Vec<StructNode>,
/// Image references for this page; never serialized.
#[serde(skip)]
pub image_refs: Vec<ImageRef>,
}
/// An outline entry: a nesting level, a title, and the location it points at
/// (a page index plus an optional vertical position).
#[doc(hidden)]
#[derive(Debug, Clone)]
pub struct OutlineTarget {
/// Nesting level of the entry.
pub level: u8,
/// Title text of the entry.
pub title: String,
// NOTE(review): signed index — presumably a negative value marks an unresolved target; confirm with the producer.
pub page_index: i32,
// NOTE(review): name suggests a y coordinate in PDF space; `None` presumably means no position was given — confirm.
pub y_pdf: Option<f32>,
}
/// A structure node: a role string, the MCIDs associated with it, and an optional
/// bounding box and alternative text.
// NOTE(review): presumably derived from a tagged PDF's structure tree — confirm with the extractor.
#[doc(hidden)]
#[derive(Debug, Clone)]
pub struct StructNode {
/// Role of the node, as a string.
pub role: String,
// NOTE(review): presumably matched against `TextItem::mcid` — confirm.
pub mcids: Vec<i32>,
/// Bounding box, when one is available.
pub bbox: Option<Rect>,
/// Alternative text, when one is available.
pub alt_text: Option<String>,
}
/// A page after parsing: page number and dimensions, a text rendering, the text
/// items, and layout data (projected lines, regions, graphics, figures,
/// structure nodes, image references).
///
/// Only `page_number`, `page_width`, `page_height`, `text` and `text_items` are
/// serialized; every other field is marked `#[serde(skip)]`.
#[derive(Debug, Serialize)]
pub struct ParsedPage {
pub page_number: usize,
pub page_width: f32,
pub page_height: f32,
/// Text of the page as a single string.
pub text: String,
/// Text items belonging to this page.
pub text_items: Vec<TextItem>,
/// Lines projected from the text items; never serialized.
#[serde(skip)]
pub projected_lines: Vec<ProjectedLine>,
/// Root of the region tree (see [`Region`] / [`RegionKind`]); never serialized.
#[serde(skip)]
pub regions: Region,
/// Graphic primitives for this page; never serialized.
#[serde(skip)]
pub graphics: Vec<GraphicPrimitive>,
/// Rectangles of figures on this page; never serialized.
#[serde(skip)]
pub figures: Vec<Rect>,
/// Structure nodes for this page; never serialized.
#[serde(skip)]
pub struct_nodes: Vec<StructNode>,
/// Image references for this page; never serialized.
#[serde(skip)]
pub image_refs: Vec<ImageRef>,
}
/// A reference to an image: an identifier, a bounding box, and an object index.
#[doc(hidden)]
#[derive(Debug, Clone)]
pub struct ImageRef {
/// Identifier of the image.
// NOTE(review): presumably the same id that appears in `ExtractedImage::id` — confirm.
pub id: String,
/// Bounding box of the image.
pub bbox: Rect,
// NOTE(review): which object list this indexes into is not visible in this file — confirm.
pub obj_index: usize,
}
/// An extracted image: its metadata plus the image data itself.
///
/// The metadata (`id`, `page`, `bbox`, `format`) is serialized; `bytes` is marked
/// `#[serde(skip)]` and therefore never appears in serialized output.
#[derive(Debug, Clone, Serialize)]
pub struct ExtractedImage {
/// Identifier of the image.
pub id: String,
// NOTE(review): whether this is 0- or 1-based is not established by this file — confirm.
pub page: u32,
/// Bounding box of the image.
pub bbox: Rect,
// NOTE(review): the set of format strings in use (e.g. file extension vs. MIME type) is not visible here.
pub format: String,
/// The image data; never serialized.
#[serde(skip)]
pub bytes: Vec<u8>,
}
/// A rectangle described by an `(x, y)` position plus a width and a height.
///
/// `Default` yields a rectangle with all four fields set to `0.0`.
// NOTE(review): which corner `(x, y)` denotes and the direction of the y axis are not stated in this file — confirm.
#[doc(hidden)]
#[derive(Debug, Clone, Default, Serialize)]
pub struct Rect {
pub x: f32,
pub y: f32,
pub width: f32,
pub height: f32,
}
/// A graphic primitive: either a stroke between two points or a rectangle.
#[doc(hidden)]
#[derive(Debug, Clone)]
pub enum GraphicPrimitive {
/// A stroke from `(x1, y1)` to `(x2, y2)` with an optional color and a width.
Stroke {
x1: f32,
y1: f32,
x2: f32,
y2: f32,
// NOTE(review): the string encoding of colors is not visible in this file.
color: Option<String>,
width: f32,
},
/// A rectangle with optional fill and stroke values.
Rect {
bbox: Rect,
fill: Option<String>,
stroke: Option<String>,
},
}
impl GraphicPrimitive {
    /// Returns the bounding rectangle of this primitive.
    ///
    /// * `Stroke` — the rectangle spanned by the two endpoints: its position is the
    ///   smaller of each coordinate pair and its extent is the absolute difference.
    ///   The stroke's `width` is not taken into account.
    /// * `Rect` — a clone of the stored `bbox`.
    pub fn bbox(&self) -> Rect {
        match *self {
            // All four coordinates are `Copy`, so they can be bound by value.
            Self::Stroke { x1, y1, x2, y2, .. } => Rect {
                x: x1.min(x2),
                y: y1.min(y2),
                width: (x2 - x1).abs(),
                height: (y2 - y1).abs(),
            },
            Self::Rect { ref bbox, .. } => bbox.clone(),
        }
    }
}
/// A line of text with its bounding box, anchor, font summary, style flags and the
/// `TextItem` spans attached to it.
#[doc(hidden)]
#[derive(Debug, Clone, Serialize)]
pub struct ProjectedLine {
/// Text of the line.
pub text: String,
/// Bounding box of the line.
pub bbox: Rect,
/// Anchor assigned to the line.
pub anchor: Anchor,
pub indent_x: f32,
pub dominant_font_size: f32,
// NOTE(review): presumably set when `dominant_font_size` was inferred rather than read from font data — confirm.
pub font_size_is_estimated: bool,
pub heading_font_size: Option<f32>,
pub dominant_font_name: Option<String>,
// NOTE(review): the `all_*` flags presumably mean every span on the line has that style — confirm with the code that builds lines.
pub all_bold: bool,
pub all_italic: bool,
pub all_mono: bool,
pub all_strike: bool,
/// The text items attached to this line.
pub spans: Vec<TextItem>,
// NOTE(review): presumably child indices leading from the root `Region` to this line's region — confirm.
pub region_path: Vec<u16>,
pub mcid: Option<i32>,
// NOTE(review): presumably true when the line lies inside one of `ParsedPage::figures` — confirm.
pub in_figure: bool,
}
/// A node in a tree of regions: a bounding box plus either leaf contents or
/// a split into child regions (see [`RegionKind`]).
///
/// `Default` yields a zeroed `bbox` and the default `RegionKind`.
#[doc(hidden)]
#[derive(Debug, Clone, Default)]
pub struct Region {
/// Bounding box of the region.
pub bbox: Rect,
/// Whether this region is a leaf or is split into children.
pub kind: RegionKind,
}
/// The contents of a [`Region`]: a leaf holding item indices, or a split along
/// an axis into child regions.
#[doc(hidden)]
#[derive(Debug, Clone)]
pub enum RegionKind {
/// A region that is not subdivided further.
Leaf {
// NOTE(review): presumably indices into the page's text-item list — the indexed collection is not visible here.
item_indices: Vec<usize>,
},
/// A region divided along `axis` into `children`.
Split {
axis: CutAxis,
children: Vec<Region>,
},
}
impl Default for RegionKind {
fn default() -> Self {
RegionKind::Leaf {
item_indices: Vec::new(),
}
}
}
/// The axis along which a `RegionKind::Split` divides a region.
// NOTE(review): whether `Horizontal` names the direction of the cut line or of the resulting arrangement is not established in this file — confirm.
#[doc(hidden)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutAxis {
Horizontal,
Vertical,
}
/// Snap side recorded on a [`ProjectedTextItem`]: left, right, or center.
///
/// Note that this type derives only `Debug` and `Serialize`; it is neither `Clone`
/// nor comparable.
#[doc(hidden)]
#[derive(Debug, Serialize)]
pub enum Snap {
Left,
Right,
Center,
}
/// Anchor classification used by [`ProjectedLine`] and [`ProjectedTextItem`].
///
/// Has the same three sides as [`Snap`] plus a `Floating` variant.
#[doc(hidden)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum Anchor {
Left,
Right,
Center,
// NOTE(review): presumably used when no left/right/center anchor applies — confirm.
Floating,
}
/// A [`TextItem`] wrapped with projection bookkeeping: its snap and anchor,
/// a set of status flags, and a copy of geometry values in the `orig_*` fields.
///
/// Derives only `Debug` and `Serialize` (it contains a [`Snap`], which is not `Clone`).
#[doc(hidden)]
#[derive(Debug, Serialize)]
pub struct ProjectedTextItem {
/// The wrapped text item.
pub item: TextItem,
pub snap: Snap,
pub anchor: Anchor,
// NOTE(review): presumably marks an item detected as a duplicate of another — confirm.
pub is_dup: bool,
pub rendered: bool,
pub num_spaces: usize,
pub force_unsnapped: bool,
pub is_margin_line_number: bool,
pub rotated: bool,
// NOTE(review): the meaning of `d` is not visible in this file — confirm with the code that sets it.
pub d: f32,
// NOTE(review): the `orig_*` fields presumably preserve the item's geometry from before projection modified it — confirm.
pub orig_x: f32,
pub orig_y: f32,
pub orig_width: f32,
pub orig_height: f32,
pub orig_rotation: f32,
}
/// Map from an `i32` key to a list of `(usize, usize)` pairs.
// NOTE(review): the meaning of the key and of the two indices is not visible in this file — confirm with the code that fills the map.
#[doc(hidden)]
pub type AnchorMap = HashMap<i32, Vec<(usize, usize)>>;
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a small `TextItem` with text, geometry and font name/size set;
    /// every other field keeps its `Default` value.
    fn sample_item() -> TextItem {
        TextItem {
            text: String::from("hi"),
            x: 1.0,
            y: 2.0,
            width: 10.0,
            height: 4.0,
            font_name: Some(String::from("Arial")),
            font_size: Some(12.0),
            ..TextItem::default()
        }
    }

    /// `None` options and `false` flags must not appear in the JSON output.
    #[test]
    fn text_item_skips_none_fields() {
        let json = serde_json::to_string(&sample_item()).unwrap();
        for absent in ["font_height", "confidence", "font_is_buggy"] {
            assert!(!json.contains(absent));
        }
        assert!(json.contains("\"text\":\"hi\""));
    }

    /// A `true` flag is written to the JSON output.
    #[test]
    fn text_item_includes_buggy_flag_when_true() {
        let buggy = TextItem {
            font_is_buggy: true,
            ..sample_item()
        };
        let json = serde_json::to_string(&buggy).unwrap();
        assert!(json.contains("font_is_buggy"));
    }

    /// A `Page` serializes and carries its page number.
    #[test]
    fn page_serializes() {
        let page = Page {
            page_number: 1,
            page_width: 100.0,
            page_height: 200.0,
            text_items: vec![sample_item()],
            graphics: Vec::new(),
            struct_nodes: Vec::new(),
            image_refs: Vec::new(),
        };
        let json = serde_json::to_string(&page).unwrap();
        assert!(json.contains("\"page_number\":1"));
    }

    /// `AnchorMap` behaves like a plain map of key -> list of index pairs.
    #[test]
    fn anchor_map_basic() {
        let mut anchors = AnchorMap::new();
        anchors.entry(5).or_default().push((1, 2));
        assert_eq!(anchors[&5][0], (1, 2));
    }
}