1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::ParseResult;
9use std::io::{Read, Seek};
10
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are collected and the final text is
    /// rebuilt from those fragments instead of from the raw operator order.
    pub preserve_layout: bool,
    /// Horizontal gap, relative to the font size, that must be exceeded
    /// before a space is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Vertical distance that must be exceeded before two pieces of text are
    /// considered to be on different lines.
    pub newline_threshold: f64,
    /// When `true`, collected fragments are sorted by their position.
    pub sort_by_position: bool,
    /// When `true`, sorting additionally tries to detect columns and orders
    /// fragments column by column.
    pub detect_columns: bool,
    /// Horizontal gap between neighbouring fragments on a line that must be
    /// exceeded for the gap to count as a column boundary.
    pub column_threshold: f64,
    /// When `true`, a hyphen at the end of a line is removed and the line is
    /// joined with the following one while rebuilding text from fragments.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain reading-order extraction: no layout preservation, no column
    /// detection, position sorting and hyphen merging enabled.
    fn default() -> Self {
        ExtractionOptions {
            // Output shaping switches.
            preserve_layout: false,
            sort_by_position: true,
            merge_hyphenated: true,
            // Spacing heuristics.
            space_threshold: 0.2,
            newline_threshold: 10.0,
            // Column handling.
            detect_columns: false,
            column_threshold: 50.0,
        }
    }
}
43
/// Result of extracting the text of a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as one string, with spaces and line breaks inserted by
    /// the extractor's heuristics.
    pub text: String,
    /// Positioned pieces of text; only populated when the extractor runs with
    /// `preserve_layout` enabled.
    pub fragments: Vec<TextFragment>,
}
52
/// A piece of text together with its estimated placement.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the fragment.
    pub text: String,
    /// Horizontal position of the fragment's origin.
    pub x: f64,
    /// Vertical position of the fragment's origin.
    pub y: f64,
    /// Estimated width of the text (an approximation, not taken from real
    /// font metrics).
    pub width: f64,
    /// Approximate height of the text, derived from the font size.
    pub height: f64,
    /// Font size that was in effect when the text was shown.
    pub font_size: f64,
}
69
/// Text-state parameters tracked while walking a page's content operations.
struct TextState {
    /// Current text matrix; reset when a text object begins, replaced by
    /// `SetTextMatrix` and advanced after text is shown.
    text_matrix: [f64; 6],
    /// Matrix of the start of the current line; the base for `MoveText` and
    /// `NextLine`.
    text_line_matrix: [f64; 6],
    // Only initialised in this file and never updated by the extractor,
    // hence the lint allowance.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Line leading set by `SetLeading`; `NextLine` moves down by this amount.
    leading: f64,
    /// Character spacing set by `SetCharSpacing`.
    char_space: f64,
    /// Word spacing set by `SetWordSpacing`.
    word_space: f64,
    /// Horizontal scaling in percent (divided by 100 where it is applied).
    horizontal_scale: f64,
    /// Text rise set by `SetTextRise`.
    text_rise: f64,
    /// Font size set by `SetFont`.
    font_size: f64,
    /// Font name set by `SetFont`; used to guess the text encoding.
    font_name: Option<String>,
    /// Rendering mode set by `SetTextRenderMode`.
    render_mode: u8,
}

impl Default for TextState {
    /// Identity matrices, 100% horizontal scaling, no font selected and every
    /// other parameter zeroed.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            ctm: IDENTITY,
            font_name: None,
            font_size: 0.0,
            horizontal_scale: 100.0,
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            text_rise: 0.0,
            render_mode: 0,
        }
    }
}
114
/// Extracts text from the pages of a parsed PDF document.
pub struct TextExtractor {
    /// Settings that steer spacing, sorting and layout reconstruction.
    options: ExtractionOptions,
}
119
120impl TextExtractor {
121 pub fn new() -> Self {
123 Self {
124 options: ExtractionOptions::default(),
125 }
126 }
127
128 pub fn with_options(options: ExtractionOptions) -> Self {
130 Self { options }
131 }
132
133 pub fn extract_from_document<R: Read + Seek>(
135 &self,
136 document: &PdfDocument<R>,
137 ) -> ParseResult<Vec<ExtractedText>> {
138 let page_count = document.page_count()?;
139 let mut results = Vec::new();
140
141 for i in 0..page_count {
142 let text = self.extract_from_page(document, i)?;
143 results.push(text);
144 }
145
146 Ok(results)
147 }
148
149 pub fn extract_from_page<R: Read + Seek>(
151 &self,
152 document: &PdfDocument<R>,
153 page_index: u32,
154 ) -> ParseResult<ExtractedText> {
155 let page = document.get_page(page_index)?;
157
158 let streams = page.content_streams_with_document(document)?;
160
161 let mut extracted_text = String::new();
162 let mut fragments = Vec::new();
163 let mut state = TextState::default();
164 let mut in_text_object = false;
165 let mut last_x = 0.0;
166 let mut last_y = 0.0;
167
168 for stream_data in streams {
170 let operations = ContentParser::parse_content(&stream_data)?;
171
172 for op in operations {
173 match op {
174 ContentOperation::BeginText => {
175 in_text_object = true;
176 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
178 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
179 }
180
181 ContentOperation::EndText => {
182 in_text_object = false;
183 }
184
185 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
186 state.text_matrix =
187 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
188 state.text_line_matrix =
189 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
190 }
191
192 ContentOperation::MoveText(tx, ty) => {
193 let new_matrix = multiply_matrix(
195 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
196 &state.text_line_matrix,
197 );
198 state.text_matrix = new_matrix;
199 state.text_line_matrix = new_matrix;
200 }
201
202 ContentOperation::NextLine => {
203 let new_matrix = multiply_matrix(
205 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
206 &state.text_line_matrix,
207 );
208 state.text_matrix = new_matrix;
209 state.text_line_matrix = new_matrix;
210 }
211
212 ContentOperation::ShowText(text) => {
213 if in_text_object {
214 let text_bytes = &text;
215 let decoded = self.decode_text(text_bytes, &state)?;
216
217 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
219
220 if !extracted_text.is_empty() {
222 let dx = x - last_x;
223 let dy = (y - last_y).abs();
224
225 if dy > self.options.newline_threshold {
226 extracted_text.push('\n');
227 } else if dx > self.options.space_threshold * state.font_size {
228 extracted_text.push(' ');
229 }
230 }
231
232 extracted_text.push_str(&decoded);
233
234 if self.options.preserve_layout {
235 fragments.push(TextFragment {
236 text: decoded.clone(),
237 x,
238 y,
239 width: calculate_text_width(&decoded, state.font_size),
240 height: state.font_size,
241 font_size: state.font_size,
242 });
243 }
244
245 last_x = x + calculate_text_width(&decoded, state.font_size);
247 last_y = y;
248
249 let text_width = calculate_text_width(&decoded, state.font_size);
251 let tx = text_width * state.horizontal_scale / 100.0;
252 state.text_matrix =
253 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
254 }
255 }
256
257 ContentOperation::ShowTextArray(array) => {
258 if in_text_object {
259 for item in array {
260 match item {
261 TextElement::Text(text_bytes) => {
262 let decoded = self.decode_text(&text_bytes, &state)?;
263 extracted_text.push_str(&decoded);
264
265 let text_width =
267 calculate_text_width(&decoded, state.font_size);
268 let tx = text_width * state.horizontal_scale / 100.0;
269 state.text_matrix = multiply_matrix(
270 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
271 &state.text_matrix,
272 );
273 }
274 TextElement::Spacing(adjustment) => {
275 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
277 state.text_matrix = multiply_matrix(
278 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
279 &state.text_matrix,
280 );
281 }
282 }
283 }
284 }
285 }
286
287 ContentOperation::SetFont(name, size) => {
288 state.font_name = Some(name);
289 state.font_size = size as f64;
290 }
291
292 ContentOperation::SetLeading(leading) => {
293 state.leading = leading as f64;
294 }
295
296 ContentOperation::SetCharSpacing(spacing) => {
297 state.char_space = spacing as f64;
298 }
299
300 ContentOperation::SetWordSpacing(spacing) => {
301 state.word_space = spacing as f64;
302 }
303
304 ContentOperation::SetHorizontalScaling(scale) => {
305 state.horizontal_scale = scale as f64;
306 }
307
308 ContentOperation::SetTextRise(rise) => {
309 state.text_rise = rise as f64;
310 }
311
312 ContentOperation::SetTextRenderMode(mode) => {
313 state.render_mode = mode as u8;
314 }
315
316 _ => {
317 }
319 }
320 }
321 }
322
323 if self.options.sort_by_position && !fragments.is_empty() {
325 self.sort_and_merge_fragments(&mut fragments);
326 }
327
328 if self.options.preserve_layout && !fragments.is_empty() {
330 extracted_text = self.reconstruct_text_from_fragments(&fragments);
331 }
332
333 Ok(ExtractedText {
334 text: extracted_text,
335 fragments,
336 })
337 }
338
339 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
341 fragments.sort_by(|a, b| {
343 let y_diff = (b.y - a.y).abs();
345 if y_diff < self.options.newline_threshold {
346 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
348 } else {
349 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
351 }
352 });
353
354 if self.options.detect_columns {
356 self.detect_and_sort_columns(fragments);
357 }
358 }
359
360 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
362 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
364 let mut current_line: Vec<&mut TextFragment> = Vec::new();
365 let mut last_y = f64::INFINITY;
366
367 for fragment in fragments.iter_mut() {
368 let fragment_y = fragment.y;
369 if (last_y - fragment_y).abs() > self.options.newline_threshold
370 && !current_line.is_empty()
371 {
372 lines.push(current_line);
373 current_line = Vec::new();
374 }
375 current_line.push(fragment);
376 last_y = fragment_y;
377 }
378 if !current_line.is_empty() {
379 lines.push(current_line);
380 }
381
382 let mut column_boundaries = vec![0.0];
384 for line in &lines {
385 if line.len() > 1 {
386 for i in 0..line.len() - 1 {
387 let gap = line[i + 1].x - (line[i].x + line[i].width);
388 if gap > self.options.column_threshold {
389 let boundary = line[i].x + line[i].width + gap / 2.0;
390 if !column_boundaries
391 .iter()
392 .any(|&b| (b - boundary).abs() < 10.0)
393 {
394 column_boundaries.push(boundary);
395 }
396 }
397 }
398 }
399 }
400 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
401
402 if column_boundaries.len() > 1 {
404 fragments.sort_by(|a, b| {
405 let col_a = column_boundaries
407 .iter()
408 .position(|&boundary| a.x < boundary)
409 .unwrap_or(column_boundaries.len())
410 - 1;
411 let col_b = column_boundaries
412 .iter()
413 .position(|&boundary| b.x < boundary)
414 .unwrap_or(column_boundaries.len())
415 - 1;
416
417 if col_a != col_b {
418 col_a.cmp(&col_b)
419 } else {
420 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
422 }
423 });
424 }
425 }
426
427 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
429 let mut result = String::new();
430 let mut last_y = f64::INFINITY;
431 let mut last_x = 0.0;
432 let mut last_line_ended_with_hyphen = false;
433
434 for fragment in fragments {
435 let y_diff = (last_y - fragment.y).abs();
437 if !result.is_empty() && y_diff > self.options.newline_threshold {
438 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
440 if result.ends_with('-') {
442 result.pop();
443 }
444 } else {
445 result.push('\n');
446 }
447 } else if !result.is_empty() {
448 let x_gap = fragment.x - last_x;
450 if x_gap > self.options.space_threshold * fragment.font_size {
451 result.push(' ');
452 }
453 }
454
455 result.push_str(&fragment.text);
456 last_line_ended_with_hyphen = fragment.text.ends_with('-');
457 last_y = fragment.y;
458 last_x = fragment.x + fragment.width;
459 }
460
461 result
462 }
463
464 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
466 use crate::text::encoding::TextEncoding;
467
468 let encoding = if let Some(ref font_name) = state.font_name {
470 match font_name.to_lowercase().as_str() {
471 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
472 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
473 name if name.contains("standard") => TextEncoding::StandardEncoding,
474 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
475 _ => {
476 if font_name.starts_with("Times")
478 || font_name.starts_with("Helvetica")
479 || font_name.starts_with("Courier")
480 {
481 TextEncoding::WinAnsiEncoding } else {
483 TextEncoding::PdfDocEncoding }
485 }
486 }
487 } else {
488 TextEncoding::WinAnsiEncoding };
490
491 Ok(encoding.decode(text))
492 }
493}
494
495impl Default for TextExtractor {
496 fn default() -> Self {
497 Self::new()
498 }
499}
500
/// Multiplies two affine matrices stored as `[a, b, c, d, e, f]`.
///
/// The result is the product `a × b` under the row-vector convention, i.e.
/// the transformation that applies `a` first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part first, then the translation row.
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
512
/// Applies the affine `matrix` (`[a, b, c, d, e, f]`) to the point `(x, y)`
/// and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
519
/// Estimates the width of `text` at `font_size`, assuming every character is
/// half an em wide.
///
/// The estimate counts characters rather than UTF-8 bytes, so decoded
/// non-ASCII characters (which occupy several bytes) are not over-measured.
/// No font metrics are consulted.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
525
#[cfg(test)]
mod tests {
    use super::*;

    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Asserts that `options` carries the documented default values.
    fn assert_default_options(options: &ExtractionOptions) {
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    /// Builds a fragment on the shared test baseline at horizontal position `x`.
    fn sample_fragment(text: &str, x: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        }
    }

    #[test]
    fn test_matrix_multiplication() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        // The identity is neutral on both sides.
        assert_eq!(multiply_matrix(&IDENTITY, &translation), translation);
        assert_eq!(multiply_matrix(&translation, &IDENTITY), translation);
    }

    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        assert_default_options(&ExtractionOptions::default());
    }

    #[test]
    fn test_extraction_options_custom() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };

        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }

    #[test]
    fn test_text_fragment() {
        let fragment = sample_fragment("Hello", 100.0);

        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments: vec![
                sample_fragment("Hello", 100.0),
                sample_fragment("World", 160.0),
            ],
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }

    #[test]
    fn test_text_state_default() {
        let state = TextState::default();

        assert_eq!(state.text_matrix, IDENTITY);
        assert_eq!(state.text_line_matrix, IDENTITY);
        assert_eq!(state.ctm, IDENTITY);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // (matrix, input point, expected output point)
        let cases = [
            // Quarter-turn rotation.
            ([0.0, 1.0, -1.0, 0.0, 0.0, 0.0], (1.0, 0.0), (0.0, 1.0)),
            // Non-uniform scaling.
            ([2.0, 0.0, 0.0, 3.0, 0.0, 0.0], (5.0, 5.0), (10.0, 15.0)),
            // Combined linear part and translation.
            ([2.0, 1.0, 1.0, 2.0, 10.0, 20.0], (1.0, 1.0), (13.0, 23.0)),
        ];

        for &(matrix, (px, py), expected) in &cases {
            assert_eq!(transform_point(px, py, &matrix), expected);
        }
    }

    #[test]
    fn test_text_extractor_new() {
        assert_default_options(&TextExtractor::new().options);
    }

    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };
        let extractor = TextExtractor::with_options(options.clone());
        let stored = &extractor.options;

        assert_eq!(stored.preserve_layout, options.preserve_layout);
        assert_eq!(stored.space_threshold, options.space_threshold);
        assert_eq!(stored.newline_threshold, options.newline_threshold);
        assert_eq!(stored.sort_by_position, options.sort_by_position);
        assert_eq!(stored.detect_columns, options.detect_columns);
        assert_eq!(stored.column_threshold, options.column_threshold);
        assert_eq!(stored.merge_hyphenated, options.merge_hyphenated);
    }
}