Skip to main content

bookforge_pdf/
model.rs

//! Page/line intermediate representation produced by the poppler XML
//! parser and consumed by reconstruction. Coordinates are in pdftohtml's
//! integer pixel units, with a top-left origin.
4
/// A run of text carrying a single bold/italic styling.
///
/// Spans are the unit of styled text stored by fragments, lines and
/// document blocks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Span {
    /// Text content of the run.
    pub text: String,
    /// `true` when the run is set in bold.
    pub bold: bool,
    /// `true` when the run is set in italic.
    pub italic: bool,
}
12
13/// One `<text>` fragment from pdftohtml, already a visual line or part
14/// of one.
15#[derive(Debug, Clone, PartialEq)]
16pub struct Fragment {
17    pub top: i32,
18    pub left: i32,
19    pub width: i32,
20    pub height: i32,
21    pub font: u32,
22    pub spans: Vec<Span>,
23}
24
25impl Fragment {
26    pub fn right(&self) -> i32 {
27        self.left + self.width
28    }
29
30    pub fn char_count(&self) -> usize {
31        self.spans
32            .iter()
33            .map(|span| span.text.chars().filter(|ch| !ch.is_whitespace()).count())
34            .sum()
35    }
36}
37
38/// A merged visual line (one or more fragments at the same height).
39#[derive(Debug, Clone, PartialEq)]
40pub struct Line {
41    pub top: i32,
42    pub left: i32,
43    pub right: i32,
44    pub height: i32,
45    pub font_size: u32,
46    pub spans: Vec<Span>,
47}
48
49impl Line {
50    pub fn width(&self) -> i32 {
51        self.right - self.left
52    }
53
54    pub fn text(&self) -> String {
55        self.spans
56            .iter()
57            .map(|span| span.text.as_str())
58            .collect::<String>()
59    }
60
61    pub fn char_count(&self) -> usize {
62        self.spans
63            .iter()
64            .map(|span| span.text.chars().filter(|ch| !ch.is_whitespace()).count())
65            .sum()
66    }
67}
68
69#[derive(Debug, Clone, PartialEq)]
70pub struct Page {
71    pub number: u32,
72    pub width: i32,
73    pub height: i32,
74    pub fragments: Vec<Fragment>,
75    /// font id -> point size, from `<fontspec>` declarations.
76    pub font_sizes: std::collections::HashMap<u32, u32>,
77}
78
/// How columns should be handled, as requested on the command line.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ColumnMode {
    /// The default mode when nothing else is requested.
    #[default]
    Auto,
    /// Single-column handling.
    Single,
    /// Two-column handling.
    Two,
}
87
88/// A reconstructed, reading-ordered document block ready for XHTML
89/// emission.
90#[derive(Debug, Clone, PartialEq, Eq)]
91pub enum DocBlock {
92    Heading { level: u8, spans: Vec<Span> },
93    Paragraph { spans: Vec<Span> },
94}
95
96impl DocBlock {
97    pub fn spans(&self) -> &[Span] {
98        match self {
99            DocBlock::Heading { spans, .. } => spans,
100            DocBlock::Paragraph { spans } => spans,
101        }
102    }
103
104    pub fn text(&self) -> String {
105        self.spans()
106            .iter()
107            .map(|span| span.text.as_str())
108            .collect::<String>()
109    }
110
111    pub fn char_count(&self) -> usize {
112        self.spans()
113            .iter()
114            .map(|span| span.text.chars().filter(|ch| !ch.is_whitespace()).count())
115            .sum()
116    }
117}