1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::ParseResult;
9use std::io::{Read, Seek};
10
/// Tunable settings for [`TextExtractor`].
#[derive(Clone, Debug)]
pub struct ExtractionOptions {
    /// When `true`, every shown string is also recorded as a positioned
    /// [`TextFragment`]; when `false` only the flat text is produced.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a fraction of the current font size,
    /// above which a space is inserted between two consecutive strings.
    pub space_threshold: f64,
    /// Absolute vertical distance between two consecutive strings above
    /// which a line break is inserted.
    pub newline_threshold: f64,
}

impl Default for ExtractionOptions {
    /// Layout preservation off, a space threshold of `0.2` and a newline
    /// threshold of `10.0`.
    fn default() -> Self {
        ExtractionOptions {
            preserve_layout: false,
            space_threshold: 0.2,
            newline_threshold: 10.0,
        }
    }
}
31
32#[derive(Debug, Clone)]
34pub struct ExtractedText {
35 pub text: String,
37 pub fragments: Vec<TextFragment>,
39}
40
/// One shown string together with where it was placed.
#[derive(Clone, Debug)]
pub struct TextFragment {
    /// The decoded text of this fragment.
    pub text: String,
    /// Horizontal coordinate of the fragment's origin.
    pub x: f64,
    /// Vertical coordinate of the fragment's origin.
    pub y: f64,
    /// Width of the fragment (the extractor stores an estimate here).
    pub width: f64,
    /// Height of the fragment (the extractor stores the font size here).
    pub height: f64,
    /// Font size that was active when the text was shown.
    pub font_size: f64,
}
57
/// Text-related graphics state tracked while walking a content stream.
///
/// Matrices are stored as `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix; reset on `BeginText` and advanced after each
    /// shown string.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; the base for `MoveText`
    /// and `NextLine`.
    text_line_matrix: [f64; 6],
    // Kept for completeness: the extraction code in this file never updates
    // or reads the CTM, hence the lint suppression.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Line leading, set by `SetLeading` and consumed by `NextLine`.
    leading: f64,
    /// Character spacing, set by `SetCharSpacing`.
    char_space: f64,
    /// Word spacing, set by `SetWordSpacing`.
    word_space: f64,
    /// Horizontal scaling in percent, set by `SetHorizontalScaling`.
    horizontal_scale: f64,
    /// Text rise, set by `SetTextRise`.
    text_rise: f64,
    /// Current font size, set by `SetFont`.
    font_size: f64,
    /// Current font resource name, set by `SetFont`.
    font_name: Option<String>,
    /// Text rendering mode, set by `SetTextRenderMode`.
    render_mode: u8,
}

impl Default for TextState {
    /// Identity matrices, 100 % horizontal scaling, no font selected and
    /// every other parameter zero.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            ctm: IDENTITY,
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            horizontal_scale: 100.0,
            text_rise: 0.0,
            font_size: 0.0,
            font_name: None,
            render_mode: 0,
        }
    }
}
102
103pub struct TextExtractor {
105 options: ExtractionOptions,
106}
107
108impl TextExtractor {
109 pub fn new() -> Self {
111 Self {
112 options: ExtractionOptions::default(),
113 }
114 }
115
116 pub fn with_options(options: ExtractionOptions) -> Self {
118 Self { options }
119 }
120
121 pub fn extract_from_document<R: Read + Seek>(
123 &self,
124 document: &PdfDocument<R>,
125 ) -> ParseResult<Vec<ExtractedText>> {
126 let page_count = document.page_count()?;
127 let mut results = Vec::new();
128
129 for i in 0..page_count {
130 let text = self.extract_from_page(document, i)?;
131 results.push(text);
132 }
133
134 Ok(results)
135 }
136
137 pub fn extract_from_page<R: Read + Seek>(
139 &self,
140 document: &PdfDocument<R>,
141 page_index: u32,
142 ) -> ParseResult<ExtractedText> {
143 let page = document.get_page(page_index)?;
145
146 let streams = page.content_streams_with_document(document)?;
148
149 let mut extracted_text = String::new();
150 let mut fragments = Vec::new();
151 let mut state = TextState::default();
152 let mut in_text_object = false;
153 let mut last_x = 0.0;
154 let mut last_y = 0.0;
155
156 for stream_data in streams {
158 let operations = ContentParser::parse_content(&stream_data)?;
159
160 for op in operations {
161 match op {
162 ContentOperation::BeginText => {
163 in_text_object = true;
164 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
166 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
167 }
168
169 ContentOperation::EndText => {
170 in_text_object = false;
171 }
172
173 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
174 state.text_matrix =
175 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
176 state.text_line_matrix =
177 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
178 }
179
180 ContentOperation::MoveText(tx, ty) => {
181 let new_matrix = multiply_matrix(
183 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
184 &state.text_line_matrix,
185 );
186 state.text_matrix = new_matrix;
187 state.text_line_matrix = new_matrix;
188 }
189
190 ContentOperation::NextLine => {
191 let new_matrix = multiply_matrix(
193 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
194 &state.text_line_matrix,
195 );
196 state.text_matrix = new_matrix;
197 state.text_line_matrix = new_matrix;
198 }
199
200 ContentOperation::ShowText(text) => {
201 if in_text_object {
202 let text_bytes = &text;
203 let decoded = self.decode_text(text_bytes, &state)?;
204
205 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
207
208 if !extracted_text.is_empty() {
210 let dx = x - last_x;
211 let dy = (y - last_y).abs();
212
213 if dy > self.options.newline_threshold {
214 extracted_text.push('\n');
215 } else if dx > self.options.space_threshold * state.font_size {
216 extracted_text.push(' ');
217 }
218 }
219
220 extracted_text.push_str(&decoded);
221
222 if self.options.preserve_layout {
223 fragments.push(TextFragment {
224 text: decoded.clone(),
225 x,
226 y,
227 width: calculate_text_width(&decoded, state.font_size),
228 height: state.font_size,
229 font_size: state.font_size,
230 });
231 }
232
233 last_x = x + calculate_text_width(&decoded, state.font_size);
235 last_y = y;
236
237 let text_width = calculate_text_width(&decoded, state.font_size);
239 let tx = text_width * state.horizontal_scale / 100.0;
240 state.text_matrix =
241 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
242 }
243 }
244
245 ContentOperation::ShowTextArray(array) => {
246 if in_text_object {
247 for item in array {
248 match item {
249 TextElement::Text(text_bytes) => {
250 let decoded = self.decode_text(&text_bytes, &state)?;
251 extracted_text.push_str(&decoded);
252
253 let text_width =
255 calculate_text_width(&decoded, state.font_size);
256 let tx = text_width * state.horizontal_scale / 100.0;
257 state.text_matrix = multiply_matrix(
258 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
259 &state.text_matrix,
260 );
261 }
262 TextElement::Spacing(adjustment) => {
263 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
265 state.text_matrix = multiply_matrix(
266 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
267 &state.text_matrix,
268 );
269 }
270 }
271 }
272 }
273 }
274
275 ContentOperation::SetFont(name, size) => {
276 state.font_name = Some(name);
277 state.font_size = size as f64;
278 }
279
280 ContentOperation::SetLeading(leading) => {
281 state.leading = leading as f64;
282 }
283
284 ContentOperation::SetCharSpacing(spacing) => {
285 state.char_space = spacing as f64;
286 }
287
288 ContentOperation::SetWordSpacing(spacing) => {
289 state.word_space = spacing as f64;
290 }
291
292 ContentOperation::SetHorizontalScaling(scale) => {
293 state.horizontal_scale = scale as f64;
294 }
295
296 ContentOperation::SetTextRise(rise) => {
297 state.text_rise = rise as f64;
298 }
299
300 ContentOperation::SetTextRenderMode(mode) => {
301 state.render_mode = mode as u8;
302 }
303
304 _ => {
305 }
307 }
308 }
309 }
310
311 Ok(ExtractedText {
312 text: extracted_text,
313 fragments,
314 })
315 }
316
317 fn decode_text(&self, text: &[u8], _state: &TextState) -> ParseResult<String> {
319 use crate::text::encoding::TextEncoding;
322
323 let encoding = TextEncoding::WinAnsiEncoding;
324 Ok(encoding.decode(text))
325 }
326}
327
328impl Default for TextExtractor {
329 fn default() -> Self {
330 Self::new()
331 }
332}
333
/// Concatenates two affine transforms given as `[a, b, c, d, e, f]`.
///
/// The result is the product `a × b` of the corresponding 3×3 matrices whose
/// last column is `[0, 0, 1]`; with the convention used by
/// `transform_point`, that means `a` is applied first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part: ordinary 2×2 product.
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;
    // Translation of `a` pushed through `b`, plus `b`'s own translation.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
345
/// Applies the affine transform `matrix` (`[a, b, c, d, e, f]`) to the point
/// `(x, y)` and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
352
/// Estimates the width of `text` at `font_size`, assuming every character is
/// half an em wide.
///
/// Characters are counted, not UTF-8 bytes, so a decoded non-ASCII character
/// such as `é` contributes a single glyph width rather than two.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
358
#[cfg(test)]
mod tests {
    use super::*;

    /// The identity transform in `[a, b, c, d, e, f]` form.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// A pure translation by (10, 20).
    const TRANSLATION: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

    /// Builds a fragment with the fixed geometry shared by the struct tests.
    fn sample_fragment(text: &str, x: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        }
    }

    #[test]
    fn test_matrix_multiplication() {
        // The identity is neutral on either side of the product.
        assert_eq!(multiply_matrix(&IDENTITY, &TRANSLATION), TRANSLATION);
        assert_eq!(multiply_matrix(&TRANSLATION, &IDENTITY), TRANSLATION);
    }

    #[test]
    fn test_transform_point() {
        let (x, y) = transform_point(5.0, 5.0, &TRANSLATION);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        let ExtractionOptions {
            preserve_layout,
            space_threshold,
            newline_threshold,
        } = ExtractionOptions::default();

        assert!(!preserve_layout);
        assert_eq!(space_threshold, 0.2);
        assert_eq!(newline_threshold, 10.0);
    }

    #[test]
    fn test_extraction_options_custom() {
        let custom = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
        };

        assert!(custom.preserve_layout);
        assert_eq!(custom.space_threshold, 0.5);
        assert_eq!(custom.newline_threshold, 15.0);
    }

    #[test]
    fn test_text_fragment() {
        let hello = sample_fragment("Hello", 100.0);

        assert_eq!(hello.text, "Hello");
        assert_eq!(hello.x, 100.0);
        assert_eq!(hello.y, 200.0);
        assert_eq!(hello.width, 50.0);
        assert_eq!(hello.height, 12.0);
        assert_eq!(hello.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let fragments = vec![
            sample_fragment("Hello", 100.0),
            sample_fragment("World", 160.0),
        ];

        let page = ExtractedText {
            text: "Hello World".to_string(),
            fragments: fragments.clone(),
        };

        assert_eq!(page.text, "Hello World");
        assert_eq!(page.fragments.len(), 2);
        assert_eq!(page.fragments[0].text, "Hello");
        assert_eq!(page.fragments[1].text, "World");
    }

    #[test]
    fn test_text_state_default() {
        let fresh = TextState::default();

        // All three matrices start out as the identity.
        assert_eq!(fresh.text_matrix, IDENTITY);
        assert_eq!(fresh.text_line_matrix, IDENTITY);
        assert_eq!(fresh.ctm, IDENTITY);

        assert_eq!(fresh.leading, 0.0);
        assert_eq!(fresh.char_space, 0.0);
        assert_eq!(fresh.word_space, 0.0);
        assert_eq!(fresh.horizontal_scale, 100.0);
        assert_eq!(fresh.text_rise, 0.0);
        assert_eq!(fresh.font_size, 0.0);
        assert!(fresh.font_name.is_none());
        assert_eq!(fresh.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // (matrix, input point, expected output point)
        let cases: [([f64; 6], (f64, f64), (f64, f64)); 3] = [
            // Quarter-turn rotation.
            ([0.0, 1.0, -1.0, 0.0, 0.0, 0.0], (1.0, 0.0), (0.0, 1.0)),
            // Non-uniform scale.
            ([2.0, 0.0, 0.0, 3.0, 0.0, 0.0], (5.0, 5.0), (10.0, 15.0)),
            // Linear part combined with a translation.
            ([2.0, 1.0, 1.0, 2.0, 10.0, 20.0], (1.0, 1.0), (13.0, 23.0)),
        ];

        for (matrix, (px, py), (want_x, want_y)) in cases {
            let (x, y) = transform_point(px, py, &matrix);
            assert_eq!(x, want_x);
            assert_eq!(y, want_y);
        }
    }

    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let options = extractor.options;

        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
    }

    #[test]
    fn test_text_extractor_with_options() {
        let wanted = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
        };

        let extractor = TextExtractor::with_options(wanted.clone());
        let stored = &extractor.options;

        assert_eq!(stored.preserve_layout, wanted.preserve_layout);
        assert_eq!(stored.space_threshold, wanted.space_threshold);
        assert_eq!(stored.newline_threshold, wanted.newline_threshold);
    }
}