1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::ParseResult;
9use std::io::{Read, Seek};
10
/// Tunable settings for text extraction.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, `extract_from_page` also records positioned
    /// `TextFragment`s alongside the plain text.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the current font size, above which a
    /// space is inserted between two consecutive text runs.
    pub space_threshold: f64,
    /// Vertical distance between two consecutive text runs above which a
    /// newline is inserted.
    pub newline_threshold: f64,
}
21
22impl Default for ExtractionOptions {
23 fn default() -> Self {
24 Self {
25 preserve_layout: false,
26 space_threshold: 0.2,
27 newline_threshold: 10.0,
28 }
29 }
30}
31
32#[derive(Debug, Clone)]
34pub struct ExtractedText {
35 pub text: String,
37 pub fragments: Vec<TextFragment>,
39}
40
/// A single run of text together with its estimated placement.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// X coordinate of the run's origin after applying the text matrix.
    pub x: f64,
    /// Y coordinate of the run's origin after applying the text matrix.
    pub y: f64,
    /// Estimated width of the run (a heuristic, not real glyph metrics).
    pub width: f64,
    /// Height of the run; the extractor sets this to the font size.
    pub height: f64,
    /// Font size in effect when the run was shown.
    pub font_size: f64,
}
57
/// Text-related graphics state tracked while walking a page's content
/// operations.
///
/// Matrices are stored as six numbers `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix; reset at the start of each text object and
    /// advanced after every shown string.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; the base for the
    /// move-text and next-line operations.
    text_line_matrix: [f64; 6],
    // Initialised to the identity but not yet updated or read by the
    // extractor; kept so the state mirrors the full set of parameters.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Line leading used by the next-line operation.
    leading: f64,
    /// Character spacing as set by the content stream (stored only).
    char_space: f64,
    /// Word spacing as set by the content stream (stored only).
    word_space: f64,
    /// Horizontal scaling in percent; `100.0` means unscaled.
    horizontal_scale: f64,
    /// Text rise as set by the content stream (stored only).
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Resource name of the currently selected font, if any.
    font_name: Option<String>,
    /// Text rendering mode as set by the content stream (stored only).
    render_mode: u8,
}
84
85impl Default for TextState {
86 fn default() -> Self {
87 Self {
88 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
89 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
90 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
91 leading: 0.0,
92 char_space: 0.0,
93 word_space: 0.0,
94 horizontal_scale: 100.0,
95 text_rise: 0.0,
96 font_size: 0.0,
97 font_name: None,
98 render_mode: 0,
99 }
100 }
101}
102
103pub struct TextExtractor {
105 options: ExtractionOptions,
106}
107
108impl TextExtractor {
109 pub fn new() -> Self {
111 Self {
112 options: ExtractionOptions::default(),
113 }
114 }
115
116 pub fn with_options(options: ExtractionOptions) -> Self {
118 Self { options }
119 }
120
121 pub fn extract_from_document<R: Read + Seek>(
123 &self,
124 document: &PdfDocument<R>,
125 ) -> ParseResult<Vec<ExtractedText>> {
126 let page_count = document.page_count()?;
127 let mut results = Vec::new();
128
129 for i in 0..page_count {
130 let text = self.extract_from_page(document, i)?;
131 results.push(text);
132 }
133
134 Ok(results)
135 }
136
137 pub fn extract_from_page<R: Read + Seek>(
139 &self,
140 document: &PdfDocument<R>,
141 page_index: u32,
142 ) -> ParseResult<ExtractedText> {
143 let page = document.get_page(page_index)?;
145
146 let streams = page.content_streams_with_document(document)?;
148
149 let mut extracted_text = String::new();
150 let mut fragments = Vec::new();
151 let mut state = TextState::default();
152 let mut in_text_object = false;
153 let mut last_x = 0.0;
154 let mut last_y = 0.0;
155
156 for stream_data in streams {
158 let operations = ContentParser::parse_content(&stream_data)?;
159
160 for op in operations {
161 match op {
162 ContentOperation::BeginText => {
163 in_text_object = true;
164 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
166 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
167 }
168
169 ContentOperation::EndText => {
170 in_text_object = false;
171 }
172
173 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
174 state.text_matrix =
175 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
176 state.text_line_matrix =
177 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
178 }
179
180 ContentOperation::MoveText(tx, ty) => {
181 let new_matrix = multiply_matrix(
183 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
184 &state.text_line_matrix,
185 );
186 state.text_matrix = new_matrix;
187 state.text_line_matrix = new_matrix;
188 }
189
190 ContentOperation::NextLine => {
191 let new_matrix = multiply_matrix(
193 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
194 &state.text_line_matrix,
195 );
196 state.text_matrix = new_matrix;
197 state.text_line_matrix = new_matrix;
198 }
199
200 ContentOperation::ShowText(text) => {
201 if in_text_object {
202 let text_bytes = &text;
203 let decoded = self.decode_text(text_bytes, &state)?;
204
205 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
207
208 if !extracted_text.is_empty() {
210 let dx = x - last_x;
211 let dy = (y - last_y).abs();
212
213 if dy > self.options.newline_threshold {
214 extracted_text.push('\n');
215 } else if dx > self.options.space_threshold * state.font_size {
216 extracted_text.push(' ');
217 }
218 }
219
220 extracted_text.push_str(&decoded);
221
222 if self.options.preserve_layout {
223 fragments.push(TextFragment {
224 text: decoded.clone(),
225 x,
226 y,
227 width: calculate_text_width(&decoded, state.font_size),
228 height: state.font_size,
229 font_size: state.font_size,
230 });
231 }
232
233 last_x = x + calculate_text_width(&decoded, state.font_size);
235 last_y = y;
236
237 let text_width = calculate_text_width(&decoded, state.font_size);
239 let tx = text_width * state.horizontal_scale / 100.0;
240 state.text_matrix =
241 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
242 }
243 }
244
245 ContentOperation::ShowTextArray(array) => {
246 if in_text_object {
247 for item in array {
248 match item {
249 TextElement::Text(text_bytes) => {
250 let decoded = self.decode_text(&text_bytes, &state)?;
251 extracted_text.push_str(&decoded);
252
253 let text_width =
255 calculate_text_width(&decoded, state.font_size);
256 let tx = text_width * state.horizontal_scale / 100.0;
257 state.text_matrix = multiply_matrix(
258 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
259 &state.text_matrix,
260 );
261 }
262 TextElement::Spacing(adjustment) => {
263 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
265 state.text_matrix = multiply_matrix(
266 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
267 &state.text_matrix,
268 );
269 }
270 }
271 }
272 }
273 }
274
275 ContentOperation::SetFont(name, size) => {
276 state.font_name = Some(name);
277 state.font_size = size as f64;
278 }
279
280 ContentOperation::SetLeading(leading) => {
281 state.leading = leading as f64;
282 }
283
284 ContentOperation::SetCharSpacing(spacing) => {
285 state.char_space = spacing as f64;
286 }
287
288 ContentOperation::SetWordSpacing(spacing) => {
289 state.word_space = spacing as f64;
290 }
291
292 ContentOperation::SetHorizontalScaling(scale) => {
293 state.horizontal_scale = scale as f64;
294 }
295
296 ContentOperation::SetTextRise(rise) => {
297 state.text_rise = rise as f64;
298 }
299
300 ContentOperation::SetTextRenderMode(mode) => {
301 state.render_mode = mode as u8;
302 }
303
304 _ => {
305 }
307 }
308 }
309 }
310
311 Ok(ExtractedText {
312 text: extracted_text,
313 fragments,
314 })
315 }
316
317 fn decode_text(&self, text: &[u8], _state: &TextState) -> ParseResult<String> {
319 use crate::text::encoding::TextEncoding;
322
323 let encoding = TextEncoding::WinAnsiEncoding;
324 Ok(encoding.decode(text))
325 }
326}
327
328impl Default for TextExtractor {
329 fn default() -> Self {
330 Self::new()
331 }
332}
333
/// Multiplies two affine matrices stored as `[a, b, c, d, e, f]` and returns
/// the product `a × b`.
///
/// With the row-vector convention used by `transform_point`, transforming a
/// point by the result is the same as transforming it by `a` first and then
/// by `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    [
        a[0] * b[0] + a[1] * b[2],
        a[0] * b[1] + a[1] * b[3],
        a[2] * b[0] + a[3] * b[2],
        a[2] * b[1] + a[3] * b[3],
        // The translation row also picks up `b`'s own translation.
        a[4] * b[0] + a[5] * b[2] + b[4],
        a[4] * b[1] + a[5] * b[3] + b[5],
    ]
}
345
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)` and
/// returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let tx = matrix[0] * x + matrix[2] * y + matrix[4];
    let ty = matrix[1] * x + matrix[3] * y + matrix[5];
    (tx, ty)
}
352
/// Estimates the width of `text` at `font_size`, assuming every character is
/// half the font size wide.
///
/// This is a heuristic, not real glyph metrics. Characters are counted, not
/// UTF-8 bytes, so a decoded non-ASCII character such as `é` counts once.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
358
#[cfg(test)]
mod tests {
    use super::*;

    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
    const TRANSLATION: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
    const SCALE: [f64; 6] = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];

    /// Multiplying by the identity on either side leaves a matrix unchanged.
    #[test]
    fn test_matrix_multiplication() {
        let result = multiply_matrix(&IDENTITY, &TRANSLATION);
        assert_eq!(result, TRANSLATION);

        let result2 = multiply_matrix(&TRANSLATION, &IDENTITY);
        assert_eq!(result2, TRANSLATION);
    }

    /// Operand order matters: the left matrix is applied first.
    #[test]
    fn test_matrix_multiplication_order() {
        // Translate, then scale: the offset is scaled too.
        assert_eq!(
            multiply_matrix(&TRANSLATION, &SCALE),
            [2.0, 0.0, 0.0, 3.0, 20.0, 60.0]
        );
        // Scale, then translate: the offset is unchanged.
        assert_eq!(
            multiply_matrix(&SCALE, &TRANSLATION),
            [2.0, 0.0, 0.0, 3.0, 10.0, 20.0]
        );
    }

    /// A pure translation offsets the point.
    #[test]
    fn test_transform_point() {
        let (x, y) = transform_point(5.0, 5.0, &TRANSLATION);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    /// A pure scale multiplies each coordinate independently.
    #[test]
    fn test_transform_point_scale() {
        assert_eq!(transform_point(5.0, 5.0, &SCALE), (10.0, 15.0));
    }

    /// The width estimate is half the font size per character.
    #[test]
    fn test_calculate_text_width_ascii() {
        assert_eq!(calculate_text_width("", 12.0), 0.0);
        assert_eq!(calculate_text_width("abcd", 10.0), 20.0);
    }
}