1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Settings that steer how text is pulled out of a page.
///
/// Two option sets compare equal when every field matches, which makes it
/// easy to check a configuration against [`ExtractionOptions::default`].
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractionOptions {
    /// Record positioned fragments while extracting and rebuild the page text
    /// from them instead of using the order of the content stream.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two runs on the same line.
    pub space_threshold: f64,
    /// Vertical distance above which two runs are considered to be on
    /// different lines.
    pub newline_threshold: f64,
    /// Sort the recorded fragments by position before the text is rebuilt.
    /// Has no effect when no fragments were recorded.
    pub sort_by_position: bool,
    /// After the positional sort, additionally regroup fragments by column.
    pub detect_columns: bool,
    /// Horizontal gap between neighbouring fragments on a line above which
    /// the gap is treated as a column separator.
    pub column_threshold: f64,
    /// When rebuilding text from fragments, drop a line-ending hyphen and
    /// join the following line directly instead of emitting a line break.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain stream-order extraction: no layout preservation, no column
    /// detection, hyphen merging and positional sorting switched on.
    fn default() -> Self {
        Self {
            preserve_layout: false,
            space_threshold: 0.2,
            newline_threshold: 10.0,
            sort_by_position: true,
            detect_columns: false,
            column_threshold: 50.0,
            merge_hyphenated: true,
        }
    }
}
47
/// Text recovered from a single page.
///
/// Implements `PartialEq` so whole extraction results can be compared in one
/// assertion.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedText {
    /// The page's text as one string.
    pub text: String,
    /// Positioned pieces of text. Only filled in when layout preservation is
    /// enabled; empty otherwise.
    pub fragments: Vec<TextFragment>,
}

/// A run of text together with its estimated placement.
#[derive(Debug, Clone, PartialEq)]
pub struct TextFragment {
    /// The decoded characters of the run.
    pub text: String,
    /// Horizontal position of the run's origin.
    pub x: f64,
    /// Vertical position of the run's origin.
    pub y: f64,
    /// Estimated width of the run (a heuristic, not real glyph metrics).
    pub width: f64,
    /// Height of the run; the extractor sets this to the font size.
    pub height: f64,
    /// Font size that was active when the run was shown.
    pub font_size: f64,
}
73
/// Mutable text parameters tracked while a content stream is replayed.
struct TextState {
    /// Current text matrix; its translation is where the next run is placed.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; line moves are applied to it.
    text_line_matrix: [f64; 6],
    // Never read by the extraction loop in this file, hence the lint opt-out.
    #[allow(dead_code)]
    /// Current transformation matrix.
    ctm: [f64; 6],
    /// Distance the line matrix moves down on a `NextLine` operation.
    leading: f64,
    /// Value of the most recent `SetCharSpacing` operation.
    char_space: f64,
    /// Value of the most recent `SetWordSpacing` operation.
    word_space: f64,
    /// Horizontal scaling as a percentage (100 means unscaled).
    horizontal_scale: f64,
    /// Value of the most recent `SetTextRise` operation.
    text_rise: f64,
    /// Size given by the most recent `SetFont` operation.
    font_size: f64,
    /// Resource name given by the most recent `SetFont` operation, if any.
    font_name: Option<String>,
    /// Value of the most recent `SetTextRenderMode` operation.
    render_mode: u8,
}

impl Default for TextState {
    /// Identity matrices, 100% horizontal scaling, no font selected and
    /// every other parameter zero.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            font_name: None,
            font_size: 0.0,
            render_mode: 0,
            horizontal_scale: 100.0,
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            text_rise: 0.0,
            ctm: IDENTITY,
            text_line_matrix: IDENTITY,
            text_matrix: IDENTITY,
        }
    }
}
118
119pub struct TextExtractor {
121 options: ExtractionOptions,
122 font_cache: HashMap<String, FontInfo>,
124}
125
126impl TextExtractor {
127 pub fn new() -> Self {
129 Self {
130 options: ExtractionOptions::default(),
131 font_cache: HashMap::new(),
132 }
133 }
134
135 pub fn with_options(options: ExtractionOptions) -> Self {
137 Self {
138 options,
139 font_cache: HashMap::new(),
140 }
141 }
142
143 pub fn extract_from_document<R: Read + Seek>(
145 &mut self,
146 document: &PdfDocument<R>,
147 ) -> ParseResult<Vec<ExtractedText>> {
148 let page_count = document.page_count()?;
149 let mut results = Vec::new();
150
151 for i in 0..page_count {
152 let text = self.extract_from_page(document, i)?;
153 results.push(text);
154 }
155
156 Ok(results)
157 }
158
159 pub fn extract_from_page<R: Read + Seek>(
161 &mut self,
162 document: &PdfDocument<R>,
163 page_index: u32,
164 ) -> ParseResult<ExtractedText> {
165 let page = document.get_page(page_index)?;
167
168 self.extract_font_resources(&page, document)?;
170
171 let streams = page.content_streams_with_document(document)?;
173
174 let mut extracted_text = String::new();
175 let mut fragments = Vec::new();
176 let mut state = TextState::default();
177 let mut in_text_object = false;
178 let mut last_x = 0.0;
179 let mut last_y = 0.0;
180
181 for stream_data in streams {
183 let operations = match ContentParser::parse_content(&stream_data) {
184 Ok(ops) => ops,
185 Err(e) => {
186 eprintln!("Warning: Failed to parse content stream, skipping: {}", e);
188 continue;
189 }
190 };
191
192 for op in operations {
193 match op {
194 ContentOperation::BeginText => {
195 in_text_object = true;
196 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
198 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
199 }
200
201 ContentOperation::EndText => {
202 in_text_object = false;
203 }
204
205 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
206 state.text_matrix =
207 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
208 state.text_line_matrix =
209 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
210 }
211
212 ContentOperation::MoveText(tx, ty) => {
213 let new_matrix = multiply_matrix(
215 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
216 &state.text_line_matrix,
217 );
218 state.text_matrix = new_matrix;
219 state.text_line_matrix = new_matrix;
220 }
221
222 ContentOperation::NextLine => {
223 let new_matrix = multiply_matrix(
225 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
226 &state.text_line_matrix,
227 );
228 state.text_matrix = new_matrix;
229 state.text_line_matrix = new_matrix;
230 }
231
232 ContentOperation::ShowText(text) => {
233 if in_text_object {
234 let text_bytes = &text;
235 let decoded = self.decode_text(text_bytes, &state)?;
236
237 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
239
240 if !extracted_text.is_empty() {
242 let dx = x - last_x;
243 let dy = (y - last_y).abs();
244
245 if dy > self.options.newline_threshold {
246 extracted_text.push('\n');
247 } else if dx > self.options.space_threshold * state.font_size {
248 extracted_text.push(' ');
249 }
250 }
251
252 extracted_text.push_str(&decoded);
253
254 if self.options.preserve_layout {
255 fragments.push(TextFragment {
256 text: decoded.clone(),
257 x,
258 y,
259 width: calculate_text_width(&decoded, state.font_size),
260 height: state.font_size,
261 font_size: state.font_size,
262 });
263 }
264
265 last_x = x + calculate_text_width(&decoded, state.font_size);
267 last_y = y;
268
269 let text_width = calculate_text_width(&decoded, state.font_size);
271 let tx = text_width * state.horizontal_scale / 100.0;
272 state.text_matrix =
273 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
274 }
275 }
276
277 ContentOperation::ShowTextArray(array) => {
278 if in_text_object {
279 for item in array {
280 match item {
281 TextElement::Text(text_bytes) => {
282 let decoded = self.decode_text(&text_bytes, &state)?;
283 extracted_text.push_str(&decoded);
284
285 let text_width =
287 calculate_text_width(&decoded, state.font_size);
288 let tx = text_width * state.horizontal_scale / 100.0;
289 state.text_matrix = multiply_matrix(
290 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
291 &state.text_matrix,
292 );
293 }
294 TextElement::Spacing(adjustment) => {
295 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
297 state.text_matrix = multiply_matrix(
298 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
299 &state.text_matrix,
300 );
301 }
302 }
303 }
304 }
305 }
306
307 ContentOperation::SetFont(name, size) => {
308 state.font_name = Some(name);
309 state.font_size = size as f64;
310 }
311
312 ContentOperation::SetLeading(leading) => {
313 state.leading = leading as f64;
314 }
315
316 ContentOperation::SetCharSpacing(spacing) => {
317 state.char_space = spacing as f64;
318 }
319
320 ContentOperation::SetWordSpacing(spacing) => {
321 state.word_space = spacing as f64;
322 }
323
324 ContentOperation::SetHorizontalScaling(scale) => {
325 state.horizontal_scale = scale as f64;
326 }
327
328 ContentOperation::SetTextRise(rise) => {
329 state.text_rise = rise as f64;
330 }
331
332 ContentOperation::SetTextRenderMode(mode) => {
333 state.render_mode = mode as u8;
334 }
335
336 _ => {
337 }
339 }
340 }
341 }
342
343 if self.options.sort_by_position && !fragments.is_empty() {
345 self.sort_and_merge_fragments(&mut fragments);
346 }
347
348 if self.options.preserve_layout && !fragments.is_empty() {
350 extracted_text = self.reconstruct_text_from_fragments(&fragments);
351 }
352
353 Ok(ExtractedText {
354 text: extracted_text,
355 fragments,
356 })
357 }
358
359 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
361 fragments.sort_by(|a, b| {
363 let y_diff = (b.y - a.y).abs();
365 if y_diff < self.options.newline_threshold {
366 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
368 } else {
369 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
371 }
372 });
373
374 if self.options.detect_columns {
376 self.detect_and_sort_columns(fragments);
377 }
378 }
379
380 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
382 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
384 let mut current_line: Vec<&mut TextFragment> = Vec::new();
385 let mut last_y = f64::INFINITY;
386
387 for fragment in fragments.iter_mut() {
388 let fragment_y = fragment.y;
389 if (last_y - fragment_y).abs() > self.options.newline_threshold
390 && !current_line.is_empty()
391 {
392 lines.push(current_line);
393 current_line = Vec::new();
394 }
395 current_line.push(fragment);
396 last_y = fragment_y;
397 }
398 if !current_line.is_empty() {
399 lines.push(current_line);
400 }
401
402 let mut column_boundaries = vec![0.0];
404 for line in &lines {
405 if line.len() > 1 {
406 for i in 0..line.len() - 1 {
407 let gap = line[i + 1].x - (line[i].x + line[i].width);
408 if gap > self.options.column_threshold {
409 let boundary = line[i].x + line[i].width + gap / 2.0;
410 if !column_boundaries
411 .iter()
412 .any(|&b| (b - boundary).abs() < 10.0)
413 {
414 column_boundaries.push(boundary);
415 }
416 }
417 }
418 }
419 }
420 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
421
422 if column_boundaries.len() > 1 {
424 fragments.sort_by(|a, b| {
425 let col_a = column_boundaries
427 .iter()
428 .position(|&boundary| a.x < boundary)
429 .unwrap_or(column_boundaries.len())
430 - 1;
431 let col_b = column_boundaries
432 .iter()
433 .position(|&boundary| b.x < boundary)
434 .unwrap_or(column_boundaries.len())
435 - 1;
436
437 if col_a != col_b {
438 col_a.cmp(&col_b)
439 } else {
440 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
442 }
443 });
444 }
445 }
446
447 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
449 let mut result = String::new();
450 let mut last_y = f64::INFINITY;
451 let mut last_x = 0.0;
452 let mut last_line_ended_with_hyphen = false;
453
454 for fragment in fragments {
455 let y_diff = (last_y - fragment.y).abs();
457 if !result.is_empty() && y_diff > self.options.newline_threshold {
458 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
460 if result.ends_with('-') {
462 result.pop();
463 }
464 } else {
465 result.push('\n');
466 }
467 } else if !result.is_empty() {
468 let x_gap = fragment.x - last_x;
470 if x_gap > self.options.space_threshold * fragment.font_size {
471 result.push(' ');
472 }
473 }
474
475 result.push_str(&fragment.text);
476 last_line_ended_with_hyphen = fragment.text.ends_with('-');
477 last_y = fragment.y;
478 last_x = fragment.x + fragment.width;
479 }
480
481 result
482 }
483
484 fn extract_font_resources<R: Read + Seek>(
486 &mut self,
487 page: &ParsedPage,
488 document: &PdfDocument<R>,
489 ) -> ParseResult<()> {
490 self.font_cache.clear();
492
493 if let Some(resources) = page.get_resources() {
495 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
496 for (font_name, font_obj) in font_dict.0.iter() {
498 if let Some(font_ref) = font_obj.as_reference() {
499 if let Ok(PdfObject::Dictionary(font_dict)) =
500 document.get_object(font_ref.0, font_ref.1)
501 {
502 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
504
505 if let Ok(font_info) =
506 cmap_extractor.extract_font_info(&font_dict, document)
507 {
508 self.font_cache.insert(font_name.0.clone(), font_info);
509 tracing::debug!(
510 "Cached font: {} -> {:?}",
511 font_name.0,
512 self.font_cache.get(&font_name.0)
513 );
514 }
515 }
516 }
517 }
518 }
519 }
520
521 Ok(())
522 }
523
524 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
526 use crate::text::encoding::TextEncoding;
527
528 if let Some(ref font_name) = state.font_name {
530 if let Some(font_info) = self.font_cache.get(font_name) {
531 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
533
534 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
536 tracing::debug!(
537 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
538 font_name,
539 text,
540 decoded
541 );
542 return Ok(decoded);
543 }
544
545 tracing::debug!(
546 "CMap decoding failed for font {}, falling back to encoding",
547 font_name
548 );
549 }
550 }
551
552 let encoding = if let Some(ref font_name) = state.font_name {
554 match font_name.to_lowercase().as_str() {
555 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
556 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
557 name if name.contains("standard") => TextEncoding::StandardEncoding,
558 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
559 _ => {
560 if font_name.starts_with("Times")
562 || font_name.starts_with("Helvetica")
563 || font_name.starts_with("Courier")
564 {
565 TextEncoding::WinAnsiEncoding } else {
567 TextEncoding::PdfDocEncoding }
569 }
570 }
571 } else {
572 TextEncoding::WinAnsiEncoding };
574
575 let fallback_result = encoding.decode(text);
576 tracing::debug!(
577 "Fallback encoding decoding: {:?} -> \"{}\"",
578 text,
579 fallback_result
580 );
581 Ok(fallback_result)
582 }
583}
584
585impl Default for TextExtractor {
586 fn default() -> Self {
587 Self::new()
588 }
589}
590
/// Multiplies two affine transforms given in six-number form
/// `[a, b, c, d, e, f]` and returns `a × b`.
///
/// Applying the result to a point is the same as applying `a` first and
/// then `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear 2×2 part.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation: `a`'s offset pushed through `b`, plus `b`'s offset.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
602
/// Applies the affine transform `matrix` (six-number form
/// `[a, b, c, d, e, f]`) to the point `(x, y)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
609
/// Estimates the width of `text` at `font_size`.
///
/// This is a heuristic, not real glyph metrics: every character is assumed
/// to be half the font size wide. Characters are counted as Unicode scalar
/// values rather than UTF-8 bytes, so non-ASCII text is not overestimated.
fn calculate_text_width(text: &str, font_size: f64) -> f64 {
    text.chars().count() as f64 * font_size * 0.5
}
615
#[cfg(test)]
mod tests {
    use super::*;

    /// The identity transform in six-number form.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Builds a fragment at height 200 with fixed size metrics.
    fn sample_fragment(text: &str, x: f64) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        }
    }

    /// Multiplying by the identity on either side leaves a matrix unchanged.
    #[test]
    fn test_matrix_multiplication() {
        let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        assert_eq!(multiply_matrix(&IDENTITY, &shift), shift);
        assert_eq!(multiply_matrix(&shift, &IDENTITY), shift);
    }

    /// A pure translation offsets both coordinates.
    #[test]
    fn test_transform_point() {
        let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        assert_eq!(transform_point(5.0, 5.0, &shift), (15.0, 25.0));
    }

    /// The documented defaults are what `Default` produces.
    #[test]
    fn test_extraction_options_default() {
        let ExtractionOptions {
            preserve_layout,
            space_threshold,
            newline_threshold,
            sort_by_position,
            detect_columns,
            column_threshold,
            merge_hyphenated,
        } = ExtractionOptions::default();

        assert!(!preserve_layout);
        assert_eq!(space_threshold, 0.2);
        assert_eq!(newline_threshold, 10.0);
        assert!(sort_by_position);
        assert!(!detect_columns);
        assert_eq!(column_threshold, 50.0);
        assert!(merge_hyphenated);
    }

    /// Every field can be set independently of the defaults.
    #[test]
    fn test_extraction_options_custom() {
        let ExtractionOptions {
            preserve_layout,
            space_threshold,
            newline_threshold,
            sort_by_position,
            detect_columns,
            column_threshold,
            merge_hyphenated,
        } = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };

        assert!(preserve_layout);
        assert_eq!(space_threshold, 0.5);
        assert_eq!(newline_threshold, 15.0);
        assert!(!sort_by_position);
        assert!(detect_columns);
        assert_eq!(column_threshold, 75.0);
        assert!(!merge_hyphenated);
    }

    /// A fragment stores exactly what it was built with.
    #[test]
    fn test_text_fragment() {
        let TextFragment {
            text,
            x,
            y,
            width,
            height,
            font_size,
        } = sample_fragment("Hello", 100.0);

        assert_eq!(text, "Hello");
        assert_eq!(x, 100.0);
        assert_eq!(y, 200.0);
        assert_eq!(width, 50.0);
        assert_eq!(height, 12.0);
        assert_eq!(font_size, 10.0);
    }

    /// An extraction result keeps its text and its fragments in order.
    #[test]
    fn test_extracted_text() {
        let pieces = vec![
            sample_fragment("Hello", 100.0),
            sample_fragment("World", 160.0),
        ];

        let page = ExtractedText {
            text: "Hello World".to_string(),
            fragments: pieces.clone(),
        };

        assert_eq!(page.text, "Hello World");
        assert_eq!(page.fragments.len(), 2);

        let words: Vec<&str> = page.fragments.iter().map(|f| f.text.as_str()).collect();
        assert_eq!(words[0], "Hello");
        assert_eq!(words[1], "World");
    }

    /// A fresh text state has identity matrices and neutral parameters.
    #[test]
    fn test_text_state_default() {
        let TextState {
            text_matrix,
            text_line_matrix,
            ctm,
            leading,
            char_space,
            word_space,
            horizontal_scale,
            text_rise,
            font_size,
            font_name,
            render_mode,
        } = TextState::default();

        assert_eq!(text_matrix, IDENTITY);
        assert_eq!(text_line_matrix, IDENTITY);
        assert_eq!(ctm, IDENTITY);
        assert_eq!(leading, 0.0);
        assert_eq!(char_space, 0.0);
        assert_eq!(word_space, 0.0);
        assert_eq!(horizontal_scale, 100.0);
        assert_eq!(text_rise, 0.0);
        assert_eq!(font_size, 0.0);
        assert!(font_name.is_none());
        assert_eq!(render_mode, 0);
    }

    /// Rotation, scaling and a mixed transform all map points as expected.
    #[test]
    fn test_matrix_operations() {
        // (matrix, input point, expected output point)
        let cases = [
            ([0.0, 1.0, -1.0, 0.0, 0.0, 0.0], (1.0, 0.0), (0.0, 1.0)),
            ([2.0, 0.0, 0.0, 3.0, 0.0, 0.0], (5.0, 5.0), (10.0, 15.0)),
            ([2.0, 1.0, 1.0, 2.0, 10.0, 20.0], (1.0, 1.0), (13.0, 23.0)),
        ];

        for &(matrix, (in_x, in_y), (want_x, want_y)) in cases.iter() {
            let (x, y) = transform_point(in_x, in_y, &matrix);
            assert_eq!(x, want_x);
            assert_eq!(y, want_y);
        }
    }

    /// `new` configures the extractor with the default options.
    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let opts = &extractor.options;

        assert!(!opts.preserve_layout);
        assert_eq!(opts.space_threshold, 0.2);
        assert_eq!(opts.newline_threshold, 10.0);
        assert!(opts.sort_by_position);
        assert!(!opts.detect_columns);
        assert_eq!(opts.column_threshold, 50.0);
        assert!(opts.merge_hyphenated);
    }

    /// `with_options` stores the options it is given, field for field.
    #[test]
    fn test_text_extractor_with_options() {
        let expected = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };

        let extractor = TextExtractor::with_options(expected.clone());
        let actual = &extractor.options;

        assert_eq!(actual.preserve_layout, expected.preserve_layout);
        assert_eq!(actual.space_threshold, expected.space_threshold);
        assert_eq!(actual.newline_threshold, expected.newline_threshold);
        assert_eq!(actual.sort_by_position, expected.sort_by_position);
        assert_eq!(actual.detect_columns, expected.detect_columns);
        assert_eq!(actual.column_threshold, expected.column_threshold);
        assert_eq!(actual.merge_hyphenated, expected.merge_hyphenated);
    }
}
795}