1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When set, positioned fragments are recorded and the page text is rebuilt from them.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which a space is inserted.
    pub space_threshold: f64,
    /// Vertical distance above which two runs are treated as belonging to different lines.
    pub newline_threshold: f64,
    /// Whether collected fragments are sorted by position after extraction.
    pub sort_by_position: bool,
    /// Whether column detection runs as part of position sorting.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments that counts as a column break.
    pub column_threshold: f64,
    /// Whether a hyphen at the end of a line is dropped and the line joined with the next one.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction: position sorting and hyphen merging on, layout and columns off.
    fn default() -> Self {
        // Feature switches.
        let preserve_layout = false;
        let sort_by_position = true;
        let detect_columns = false;
        let merge_hyphenated = true;

        // Distance thresholds.
        let space_threshold = 0.2;
        let newline_threshold = 10.0;
        let column_threshold = 50.0;

        ExtractionOptions {
            preserve_layout,
            space_threshold,
            newline_threshold,
            sort_by_position,
            detect_columns,
            column_threshold,
            merge_hyphenated,
        }
    }
}
47
/// The result of extracting text from a single page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as one string.
    pub text: String,
    /// Positioned text runs collected during extraction; `extract_from_page` only records
    /// them when `ExtractionOptions::preserve_layout` is enabled, otherwise this is empty.
    pub fragments: Vec<TextFragment>,
}
56
/// One shown run of text together with where it was placed and which font was active.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// Horizontal position of the start of the run.
    pub x: f64,
    /// Vertical position of the start of the run.
    pub y: f64,
    /// Estimated width of the run, as produced by `calculate_text_width`.
    pub width: f64,
    /// Height of the run; `extract_from_page` sets this to the font size.
    pub height: f64,
    /// Font size that was active when the run was shown.
    pub font_size: f64,
    /// Name passed to the font-selection operator, if a font had been selected.
    pub font_name: Option<String>,
    /// Bold flag guessed from the font name by `parse_font_style`.
    pub is_bold: bool,
    /// Italic flag guessed from the font name by `parse_font_style`.
    pub is_italic: bool,
}
79
/// Text-related graphics state tracked while walking a content stream.
struct TextState {
    /// Current text matrix; advanced after every shown run.
    text_matrix: [f64; 6],
    /// Matrix of the start of the current line; the base for move / next-line operators.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix accumulated from transform operators.
    ctm: [f64; 6],
    /// Line leading used by the next-line operator.
    leading: f64,
    /// Character spacing as last set by its operator (stored only).
    char_space: f64,
    /// Word spacing as last set by its operator (stored only).
    word_space: f64,
    /// Horizontal scaling in percent (100 means unscaled).
    horizontal_scale: f64,
    /// Text rise as last set by its operator (stored only).
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Name of the currently selected font, if any.
    font_name: Option<String>,
    /// Text render mode as last set by its operator (stored only).
    render_mode: u8,
}

impl Default for TextState {
    /// Identity matrices, no font selected, 100% horizontal scaling, everything else zero.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            // Matrices
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            ctm: IDENTITY,
            // Spacing parameters
            leading: 0.0,
            char_space: 0.0,
            word_space: 0.0,
            horizontal_scale: 100.0,
            text_rise: 0.0,
            // Font selection
            font_size: 0.0,
            font_name: None,
            render_mode: 0,
        }
    }
}
123
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// The name is lower-cased and searched for style markers: the words `bold`,
/// `italic` and `oblique`, plus the short forms `-b` / `-i` (anywhere in the
/// name) and a standalone `b` / `i` delimited by spaces or at the end of the name.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    let bold = mentions_any(&BOLD_MARKERS) || lowered.ends_with(" b");
    let italic = mentions_any(&ITALIC_MARKERS) || lowered.ends_with(" i");

    (bold, italic)
}
162
/// Extracts text from PDF pages according to a set of [`ExtractionOptions`].
pub struct TextExtractor {
    /// Settings applied to every extraction performed by this instance.
    options: ExtractionOptions,
    /// Font information for the page currently being processed, keyed by the font's
    /// resource name; cleared and refilled by `extract_font_resources` for each page.
    font_cache: HashMap<String, FontInfo>,
}
169
170impl TextExtractor {
171 pub fn new() -> Self {
173 Self {
174 options: ExtractionOptions::default(),
175 font_cache: HashMap::new(),
176 }
177 }
178
179 pub fn with_options(options: ExtractionOptions) -> Self {
181 Self {
182 options,
183 font_cache: HashMap::new(),
184 }
185 }
186
187 pub fn extract_from_document<R: Read + Seek>(
189 &mut self,
190 document: &PdfDocument<R>,
191 ) -> ParseResult<Vec<ExtractedText>> {
192 let page_count = document.page_count()?;
193 let mut results = Vec::new();
194
195 for i in 0..page_count {
196 let text = self.extract_from_page(document, i)?;
197 results.push(text);
198 }
199
200 Ok(results)
201 }
202
203 pub fn extract_from_page<R: Read + Seek>(
205 &mut self,
206 document: &PdfDocument<R>,
207 page_index: u32,
208 ) -> ParseResult<ExtractedText> {
209 let page = document.get_page(page_index)?;
211
212 self.extract_font_resources(&page, document)?;
214
215 let streams = page.content_streams_with_document(document)?;
217
218 let mut extracted_text = String::new();
219 let mut fragments = Vec::new();
220 let mut state = TextState::default();
221 let mut in_text_object = false;
222 let mut last_x = 0.0;
223 let mut last_y = 0.0;
224
225 for (stream_idx, stream_data) in streams.iter().enumerate() {
227 let operations = match ContentParser::parse_content(stream_data) {
228 Ok(ops) => ops,
229 Err(e) => {
230 eprintln!(
232 "Warning: Failed to parse content stream on page {}, stream {}/{}",
233 page_index + 1,
234 stream_idx + 1,
235 streams.len()
236 );
237 eprintln!(" Error: {}", e);
238 eprintln!(" Stream size: {} bytes", stream_data.len());
239
240 let preview_len = stream_data.len().min(100);
242 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
243 eprintln!(
244 " Stream preview (first {} bytes): {:?}",
245 preview_len,
246 preview.chars().take(80).collect::<String>()
247 );
248
249 continue;
251 }
252 };
253
254 for op in operations {
255 match op {
256 ContentOperation::BeginText => {
257 in_text_object = true;
258 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
260 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
261 }
262
263 ContentOperation::EndText => {
264 in_text_object = false;
265 }
266
267 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
268 state.text_matrix =
269 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
270 state.text_line_matrix =
271 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
272 }
273
274 ContentOperation::MoveText(tx, ty) => {
275 let new_matrix = multiply_matrix(
277 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
278 &state.text_line_matrix,
279 );
280 state.text_matrix = new_matrix;
281 state.text_line_matrix = new_matrix;
282 }
283
284 ContentOperation::NextLine => {
285 let new_matrix = multiply_matrix(
287 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
288 &state.text_line_matrix,
289 );
290 state.text_matrix = new_matrix;
291 state.text_line_matrix = new_matrix;
292 }
293
294 ContentOperation::ShowText(text) => {
295 if in_text_object {
296 let text_bytes = &text;
297 let decoded = self.decode_text(text_bytes, &state)?;
298
299 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
302 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
303
304 if !extracted_text.is_empty() {
306 let dx = x - last_x;
307 let dy = (y - last_y).abs();
308
309 if dy > self.options.newline_threshold {
310 extracted_text.push('\n');
311 } else if dx > self.options.space_threshold * state.font_size {
312 extracted_text.push(' ');
313 }
314 }
315
316 extracted_text.push_str(&decoded);
317
318 let font_info = state
320 .font_name
321 .as_ref()
322 .and_then(|name| self.font_cache.get(name));
323
324 if self.options.preserve_layout {
325 let (is_bold, is_italic) = state
327 .font_name
328 .as_ref()
329 .map(|name| parse_font_style(name))
330 .unwrap_or((false, false));
331
332 fragments.push(TextFragment {
333 text: decoded.clone(),
334 x,
335 y,
336 width: calculate_text_width(
337 &decoded,
338 state.font_size,
339 font_info,
340 ),
341 height: state.font_size,
342 font_size: state.font_size,
343 font_name: state.font_name.clone(),
344 is_bold,
345 is_italic,
346 });
347 }
348
349 last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
351 last_y = y;
352
353 let text_width =
355 calculate_text_width(&decoded, state.font_size, font_info);
356 let tx = text_width * state.horizontal_scale / 100.0;
357 state.text_matrix =
358 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
359 }
360 }
361
362 ContentOperation::ShowTextArray(array) => {
363 if in_text_object {
364 let font_info = state
366 .font_name
367 .as_ref()
368 .and_then(|name| self.font_cache.get(name));
369
370 for item in array {
371 match item {
372 TextElement::Text(text_bytes) => {
373 let decoded = self.decode_text(&text_bytes, &state)?;
374 extracted_text.push_str(&decoded);
375
376 let text_width = calculate_text_width(
378 &decoded,
379 state.font_size,
380 font_info,
381 );
382 let tx = text_width * state.horizontal_scale / 100.0;
383 state.text_matrix = multiply_matrix(
384 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
385 &state.text_matrix,
386 );
387 }
388 TextElement::Spacing(adjustment) => {
389 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
391 state.text_matrix = multiply_matrix(
392 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
393 &state.text_matrix,
394 );
395 }
396 }
397 }
398 }
399 }
400
401 ContentOperation::SetFont(name, size) => {
402 state.font_name = Some(name);
403 state.font_size = size as f64;
404 }
405
406 ContentOperation::SetLeading(leading) => {
407 state.leading = leading as f64;
408 }
409
410 ContentOperation::SetCharSpacing(spacing) => {
411 state.char_space = spacing as f64;
412 }
413
414 ContentOperation::SetWordSpacing(spacing) => {
415 state.word_space = spacing as f64;
416 }
417
418 ContentOperation::SetHorizontalScaling(scale) => {
419 state.horizontal_scale = scale as f64;
420 }
421
422 ContentOperation::SetTextRise(rise) => {
423 state.text_rise = rise as f64;
424 }
425
426 ContentOperation::SetTextRenderMode(mode) => {
427 state.render_mode = mode as u8;
428 }
429
430 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
431 let [a0, b0, c0, d0, e0, f0] = state.ctm;
433 let a = a as f64;
434 let b = b as f64;
435 let c = c as f64;
436 let d = d as f64;
437 let e = e as f64;
438 let f = f as f64;
439 state.ctm = [
440 a * a0 + b * c0,
441 a * b0 + b * d0,
442 c * a0 + d * c0,
443 c * b0 + d * d0,
444 e * a0 + f * c0 + e0,
445 e * b0 + f * d0 + f0,
446 ];
447 }
448
449 _ => {
450 }
452 }
453 }
454 }
455
456 if self.options.sort_by_position && !fragments.is_empty() {
458 self.sort_and_merge_fragments(&mut fragments);
459 }
460
461 if self.options.preserve_layout && !fragments.is_empty() {
463 extracted_text = self.reconstruct_text_from_fragments(&fragments);
464 }
465
466 Ok(ExtractedText {
467 text: extracted_text,
468 fragments,
469 })
470 }
471
472 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
474 fragments.sort_by(|a, b| {
476 let y_diff = (b.y - a.y).abs();
478 if y_diff < self.options.newline_threshold {
479 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
481 } else {
482 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
484 }
485 });
486
487 if self.options.detect_columns {
489 self.detect_and_sort_columns(fragments);
490 }
491 }
492
493 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
495 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
497 let mut current_line: Vec<&mut TextFragment> = Vec::new();
498 let mut last_y = f64::INFINITY;
499
500 for fragment in fragments.iter_mut() {
501 let fragment_y = fragment.y;
502 if (last_y - fragment_y).abs() > self.options.newline_threshold
503 && !current_line.is_empty()
504 {
505 lines.push(current_line);
506 current_line = Vec::new();
507 }
508 current_line.push(fragment);
509 last_y = fragment_y;
510 }
511 if !current_line.is_empty() {
512 lines.push(current_line);
513 }
514
515 let mut column_boundaries = vec![0.0];
517 for line in &lines {
518 if line.len() > 1 {
519 for i in 0..line.len() - 1 {
520 let gap = line[i + 1].x - (line[i].x + line[i].width);
521 if gap > self.options.column_threshold {
522 let boundary = line[i].x + line[i].width + gap / 2.0;
523 if !column_boundaries
524 .iter()
525 .any(|&b| (b - boundary).abs() < 10.0)
526 {
527 column_boundaries.push(boundary);
528 }
529 }
530 }
531 }
532 }
533 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
534
535 if column_boundaries.len() > 1 {
537 fragments.sort_by(|a, b| {
538 let col_a = column_boundaries
540 .iter()
541 .position(|&boundary| a.x < boundary)
542 .unwrap_or(column_boundaries.len())
543 - 1;
544 let col_b = column_boundaries
545 .iter()
546 .position(|&boundary| b.x < boundary)
547 .unwrap_or(column_boundaries.len())
548 - 1;
549
550 if col_a != col_b {
551 col_a.cmp(&col_b)
552 } else {
553 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
555 }
556 });
557 }
558 }
559
560 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
562 let mut result = String::new();
563 let mut last_y = f64::INFINITY;
564 let mut last_x = 0.0;
565 let mut last_line_ended_with_hyphen = false;
566
567 for fragment in fragments {
568 let y_diff = (last_y - fragment.y).abs();
570 if !result.is_empty() && y_diff > self.options.newline_threshold {
571 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
573 if result.ends_with('-') {
575 result.pop();
576 }
577 } else {
578 result.push('\n');
579 }
580 } else if !result.is_empty() {
581 let x_gap = fragment.x - last_x;
583 if x_gap > self.options.space_threshold * fragment.font_size {
584 result.push(' ');
585 }
586 }
587
588 result.push_str(&fragment.text);
589 last_line_ended_with_hyphen = fragment.text.ends_with('-');
590 last_y = fragment.y;
591 last_x = fragment.x + fragment.width;
592 }
593
594 result
595 }
596
597 fn extract_font_resources<R: Read + Seek>(
599 &mut self,
600 page: &ParsedPage,
601 document: &PdfDocument<R>,
602 ) -> ParseResult<()> {
603 self.font_cache.clear();
605
606 if let Some(resources) = page.get_resources() {
608 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
609 for (font_name, font_obj) in font_dict.0.iter() {
611 if let Some(font_ref) = font_obj.as_reference() {
612 if let Ok(PdfObject::Dictionary(font_dict)) =
613 document.get_object(font_ref.0, font_ref.1)
614 {
615 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
617
618 if let Ok(font_info) =
619 cmap_extractor.extract_font_info(&font_dict, document)
620 {
621 self.font_cache.insert(font_name.0.clone(), font_info);
622 tracing::debug!(
623 "Cached font: {} -> {:?}",
624 font_name.0,
625 self.font_cache.get(&font_name.0)
626 );
627 }
628 }
629 }
630 }
631 }
632 }
633
634 Ok(())
635 }
636
637 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
639 use crate::text::encoding::TextEncoding;
640
641 if let Some(ref font_name) = state.font_name {
643 if let Some(font_info) = self.font_cache.get(font_name) {
644 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
646
647 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
649 tracing::debug!(
650 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
651 font_name,
652 text,
653 decoded
654 );
655 return Ok(decoded);
656 }
657
658 tracing::debug!(
659 "CMap decoding failed for font {}, falling back to encoding",
660 font_name
661 );
662 }
663 }
664
665 let encoding = if let Some(ref font_name) = state.font_name {
667 match font_name.to_lowercase().as_str() {
668 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
669 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
670 name if name.contains("standard") => TextEncoding::StandardEncoding,
671 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
672 _ => {
673 if font_name.starts_with("Times")
675 || font_name.starts_with("Helvetica")
676 || font_name.starts_with("Courier")
677 {
678 TextEncoding::WinAnsiEncoding } else {
680 TextEncoding::PdfDocEncoding }
682 }
683 }
684 } else {
685 TextEncoding::WinAnsiEncoding };
687
688 let fallback_result = encoding.decode(text);
689 tracing::debug!(
690 "Fallback encoding decoding: {:?} -> \"{}\"",
691 text,
692 fallback_result
693 );
694 Ok(fallback_result)
695 }
696}
697
698impl Default for TextExtractor {
699 fn default() -> Self {
700 Self::new()
701 }
702}
703
/// Multiplies two affine matrices given in `[a, b, c, d, e, f]` form and returns `a x b`.
///
/// Each array stands for the 3x3 matrix `[[a, b, 0], [c, d, 0], [e, f, 1]]`, so
/// the product applies `a` first and `b` second to a row-vector point.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part (rows 1 and 2).
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;

    // Translation row: a's translation pushed through b, plus b's own translation.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
715
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
722
723fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
725 if let Some(font) = font_info {
727 if let Some(ref widths) = font.metrics.widths {
728 let first_char = font.metrics.first_char.unwrap_or(0);
729 let last_char = font.metrics.last_char.unwrap_or(255);
730 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
731
732 let mut total_width = 0.0;
733 let chars: Vec<char> = text.chars().collect();
734
735 for (i, &ch) in chars.iter().enumerate() {
736 let char_code = ch as u32;
737
738 let width = if char_code >= first_char && char_code <= last_char {
740 let index = (char_code - first_char) as usize;
741 widths.get(index).copied().unwrap_or(missing_width)
742 } else {
743 missing_width
744 };
745
746 total_width += width / 1000.0 * font_size;
748
749 if let Some(ref kerning) = font.metrics.kerning {
751 if i + 1 < chars.len() {
752 let next_char = chars[i + 1] as u32;
753 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
754 total_width += kern_value / 1000.0 * font_size;
756 }
757 }
758 }
759 }
760
761 return total_width;
762 }
763 }
764
765 text.len() as f64 * font_size * 0.5
767}
768
769#[cfg(test)]
770mod tests {
771 use super::*;
772
773 #[test]
774 fn test_matrix_multiplication() {
775 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
776 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
777
778 let result = multiply_matrix(&identity, &translation);
779 assert_eq!(result, translation);
780
781 let result2 = multiply_matrix(&translation, &identity);
782 assert_eq!(result2, translation);
783 }
784
785 #[test]
786 fn test_transform_point() {
787 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
788 let (x, y) = transform_point(5.0, 5.0, &translation);
789 assert_eq!(x, 15.0);
790 assert_eq!(y, 25.0);
791 }
792
793 #[test]
794 fn test_extraction_options_default() {
795 let options = ExtractionOptions::default();
796 assert!(!options.preserve_layout);
797 assert_eq!(options.space_threshold, 0.2);
798 assert_eq!(options.newline_threshold, 10.0);
799 assert!(options.sort_by_position);
800 assert!(!options.detect_columns);
801 assert_eq!(options.column_threshold, 50.0);
802 assert!(options.merge_hyphenated);
803 }
804
805 #[test]
806 fn test_extraction_options_custom() {
807 let options = ExtractionOptions {
808 preserve_layout: true,
809 space_threshold: 0.5,
810 newline_threshold: 15.0,
811 sort_by_position: false,
812 detect_columns: true,
813 column_threshold: 75.0,
814 merge_hyphenated: false,
815 };
816 assert!(options.preserve_layout);
817 assert_eq!(options.space_threshold, 0.5);
818 assert_eq!(options.newline_threshold, 15.0);
819 assert!(!options.sort_by_position);
820 assert!(options.detect_columns);
821 assert_eq!(options.column_threshold, 75.0);
822 assert!(!options.merge_hyphenated);
823 }
824
825 #[test]
826 fn test_parse_font_style_bold() {
827 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
829 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
830
831 assert_eq!(parse_font_style("Arial Bold"), (true, false));
833 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
834
835 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
837 }
838
839 #[test]
840 fn test_parse_font_style_italic() {
841 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
843 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
844
845 assert_eq!(parse_font_style("Arial Italic"), (false, true));
847 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
848
849 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
851 }
852
853 #[test]
854 fn test_parse_font_style_bold_italic() {
855 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
856 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
857 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
858 }
859
860 #[test]
861 fn test_parse_font_style_regular() {
862 assert_eq!(parse_font_style("Helvetica"), (false, false));
863 assert_eq!(parse_font_style("Times-Roman"), (false, false));
864 assert_eq!(parse_font_style("Courier"), (false, false));
865 assert_eq!(parse_font_style("Arial"), (false, false));
866 }
867
868 #[test]
869 fn test_parse_font_style_edge_cases() {
870 assert_eq!(parse_font_style(""), (false, false));
872 assert_eq!(parse_font_style("UnknownFont"), (false, false));
873
874 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
876 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
877 }
878
879 #[test]
880 fn test_text_fragment() {
881 let fragment = TextFragment {
882 text: "Hello".to_string(),
883 x: 100.0,
884 y: 200.0,
885 width: 50.0,
886 height: 12.0,
887 font_size: 10.0,
888 font_name: None,
889 is_bold: false,
890 is_italic: false,
891 };
892 assert_eq!(fragment.text, "Hello");
893 assert_eq!(fragment.x, 100.0);
894 assert_eq!(fragment.y, 200.0);
895 assert_eq!(fragment.width, 50.0);
896 assert_eq!(fragment.height, 12.0);
897 assert_eq!(fragment.font_size, 10.0);
898 }
899
900 #[test]
901 fn test_extracted_text() {
902 let fragments = vec![
903 TextFragment {
904 text: "Hello".to_string(),
905 x: 100.0,
906 y: 200.0,
907 width: 50.0,
908 height: 12.0,
909 font_size: 10.0,
910 font_name: None,
911 is_bold: false,
912 is_italic: false,
913 },
914 TextFragment {
915 text: "World".to_string(),
916 x: 160.0,
917 y: 200.0,
918 width: 50.0,
919 height: 12.0,
920 font_size: 10.0,
921 font_name: None,
922 is_bold: false,
923 is_italic: false,
924 },
925 ];
926
927 let extracted = ExtractedText {
928 text: "Hello World".to_string(),
929 fragments: fragments.clone(),
930 };
931
932 assert_eq!(extracted.text, "Hello World");
933 assert_eq!(extracted.fragments.len(), 2);
934 assert_eq!(extracted.fragments[0].text, "Hello");
935 assert_eq!(extracted.fragments[1].text, "World");
936 }
937
938 #[test]
939 fn test_text_state_default() {
940 let state = TextState::default();
941 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
942 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
943 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
944 assert_eq!(state.leading, 0.0);
945 assert_eq!(state.char_space, 0.0);
946 assert_eq!(state.word_space, 0.0);
947 assert_eq!(state.horizontal_scale, 100.0);
948 assert_eq!(state.text_rise, 0.0);
949 assert_eq!(state.font_size, 0.0);
950 assert!(state.font_name.is_none());
951 assert_eq!(state.render_mode, 0);
952 }
953
954 #[test]
955 fn test_matrix_operations() {
956 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
959 assert_eq!(x, 0.0);
960 assert_eq!(y, 1.0);
961
962 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
964 let (x, y) = transform_point(5.0, 5.0, &scale);
965 assert_eq!(x, 10.0);
966 assert_eq!(y, 15.0);
967
968 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
970 let (x, y) = transform_point(1.0, 1.0, &complex);
971 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
974
975 #[test]
976 fn test_text_extractor_new() {
977 let extractor = TextExtractor::new();
978 let options = extractor.options;
979 assert!(!options.preserve_layout);
980 assert_eq!(options.space_threshold, 0.2);
981 assert_eq!(options.newline_threshold, 10.0);
982 assert!(options.sort_by_position);
983 assert!(!options.detect_columns);
984 assert_eq!(options.column_threshold, 50.0);
985 assert!(options.merge_hyphenated);
986 }
987
988 #[test]
989 fn test_text_extractor_with_options() {
990 let options = ExtractionOptions {
991 preserve_layout: true,
992 space_threshold: 0.3,
993 newline_threshold: 12.0,
994 sort_by_position: false,
995 detect_columns: true,
996 column_threshold: 60.0,
997 merge_hyphenated: false,
998 };
999 let extractor = TextExtractor::with_options(options.clone());
1000 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1001 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1002 assert_eq!(
1003 extractor.options.newline_threshold,
1004 options.newline_threshold
1005 );
1006 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1007 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1008 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1009 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1010 }
1011
1012 #[test]
1017 fn test_calculate_text_width_with_no_font_info() {
1018 let width = calculate_text_width("Hello", 12.0, None);
1020
1021 assert_eq!(
1023 width, 30.0,
1024 "Without font info, should use simplified calculation: len * font_size * 0.5"
1025 );
1026 }
1027
1028 #[test]
1029 fn test_calculate_text_width_with_empty_metrics() {
1030 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1031
1032 let font_info = FontInfo {
1034 name: "TestFont".to_string(),
1035 font_type: "Type1".to_string(),
1036 encoding: None,
1037 to_unicode: None,
1038 differences: None,
1039 descendant_font: None,
1040 cid_to_gid_map: None,
1041 metrics: FontMetrics {
1042 first_char: None,
1043 last_char: None,
1044 widths: None,
1045 missing_width: Some(500.0),
1046 kerning: None,
1047 },
1048 };
1049
1050 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1051
1052 assert_eq!(
1054 width, 30.0,
1055 "Without widths array, should fall back to simplified calculation"
1056 );
1057 }
1058
1059 #[test]
1060 fn test_calculate_text_width_with_complete_metrics() {
1061 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1062
1063 let mut widths = vec![0.0; 95]; widths[72 - 32] = 722.0; widths[101 - 32] = 556.0; widths[108 - 32] = 278.0; widths[111 - 32] = 611.0; let font_info = FontInfo {
1074 name: "Helvetica".to_string(),
1075 font_type: "Type1".to_string(),
1076 encoding: None,
1077 to_unicode: None,
1078 differences: None,
1079 descendant_font: None,
1080 cid_to_gid_map: None,
1081 metrics: FontMetrics {
1082 first_char: Some(32),
1083 last_char: Some(126),
1084 widths: Some(widths),
1085 missing_width: Some(500.0),
1086 kerning: None,
1087 },
1088 };
1089
1090 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1091
1092 let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1100 let tolerance = 0.0001; assert!(
1102 (width - expected).abs() < tolerance,
1103 "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1104 expected,
1105 width,
1106 (width - expected).abs()
1107 );
1108
1109 let simplified = 5.0 * 12.0 * 0.5; assert_ne!(
1112 width, simplified,
1113 "Metrics-based calculation should differ from simplified (30.0)"
1114 );
1115 }
1116
1117 #[test]
1118 fn test_calculate_text_width_character_outside_range() {
1119 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1120
1121 let widths = vec![722.0; 26]; let font_info = FontInfo {
1125 name: "TestFont".to_string(),
1126 font_type: "Type1".to_string(),
1127 encoding: None,
1128 to_unicode: None,
1129 differences: None,
1130 descendant_font: None,
1131 cid_to_gid_map: None,
1132 metrics: FontMetrics {
1133 first_char: Some(65), last_char: Some(90), widths: Some(widths),
1136 missing_width: Some(500.0),
1137 kerning: None,
1138 },
1139 };
1140
1141 let width = calculate_text_width("A1", 10.0, Some(&font_info));
1143
1144 let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1149 assert_eq!(
1150 width, expected,
1151 "Should use missing_width for characters outside range"
1152 );
1153 }
1154
1155 #[test]
1156 fn test_calculate_text_width_missing_width_in_array() {
1157 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1158
1159 let mut widths = vec![500.0; 95]; widths[10] = 0.0; let font_info = FontInfo {
1164 name: "TestFont".to_string(),
1165 font_type: "Type1".to_string(),
1166 encoding: None,
1167 to_unicode: None,
1168 differences: None,
1169 descendant_font: None,
1170 cid_to_gid_map: None,
1171 metrics: FontMetrics {
1172 first_char: Some(32),
1173 last_char: Some(126),
1174 widths: Some(widths),
1175 missing_width: Some(600.0),
1176 kerning: None,
1177 },
1178 };
1179
1180 let char_code = 42u8 as char; let text = char_code.to_string();
1183 let width = calculate_text_width(&text, 10.0, Some(&font_info));
1184
1185 assert_eq!(
1188 width, 0.0,
1189 "Should use 0.0 width from array, not missing_width"
1190 );
1191 }
1192
1193 #[test]
1194 fn test_calculate_text_width_empty_string() {
1195 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1196
1197 let font_info = FontInfo {
1198 name: "TestFont".to_string(),
1199 font_type: "Type1".to_string(),
1200 encoding: None,
1201 to_unicode: None,
1202 differences: None,
1203 descendant_font: None,
1204 cid_to_gid_map: None,
1205 metrics: FontMetrics {
1206 first_char: Some(32),
1207 last_char: Some(126),
1208 widths: Some(vec![500.0; 95]),
1209 missing_width: Some(500.0),
1210 kerning: None,
1211 },
1212 };
1213
1214 let width = calculate_text_width("", 12.0, Some(&font_info));
1215 assert_eq!(width, 0.0, "Empty string should have zero width");
1216
1217 let width_no_font = calculate_text_width("", 12.0, None);
1219 assert_eq!(
1220 width_no_font, 0.0,
1221 "Empty string should have zero width (no font)"
1222 );
1223 }
1224
1225 #[test]
1226 fn test_calculate_text_width_unicode_characters() {
1227 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1228
1229 let font_info = FontInfo {
1231 name: "TestFont".to_string(),
1232 font_type: "Type1".to_string(),
1233 encoding: None,
1234 to_unicode: None,
1235 differences: None,
1236 descendant_font: None,
1237 cid_to_gid_map: None,
1238 metrics: FontMetrics {
1239 first_char: Some(32),
1240 last_char: Some(126),
1241 widths: Some(vec![500.0; 95]),
1242 missing_width: Some(600.0),
1243 kerning: None,
1244 },
1245 };
1246
1247 let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
1249
1250 assert_eq!(
1253 width, 6.0,
1254 "Unicode character outside range should use missing_width"
1255 );
1256 }
1257
1258 #[test]
1259 fn test_calculate_text_width_different_font_sizes() {
1260 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1261
1262 let font_info = FontInfo {
1263 name: "TestFont".to_string(),
1264 font_type: "Type1".to_string(),
1265 encoding: None,
1266 to_unicode: None,
1267 differences: None,
1268 descendant_font: None,
1269 cid_to_gid_map: None,
1270 metrics: FontMetrics {
1271 first_char: Some(65), last_char: Some(65), widths: Some(vec![722.0]),
1274 missing_width: Some(500.0),
1275 kerning: None,
1276 },
1277 };
1278
1279 let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
1281 let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
1282
1283 assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
1285 assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
1286 assert_eq!(
1287 width_20,
1288 width_10 * 2.0,
1289 "Width should scale linearly with font size"
1290 );
1291 }
1292
1293 #[test]
1294 fn test_calculate_text_width_proportional_vs_monospace() {
1295 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1296
1297 let proportional_widths = vec![278.0, 556.0, 722.0]; let proportional_font = FontInfo {
1300 name: "Helvetica".to_string(),
1301 font_type: "Type1".to_string(),
1302 encoding: None,
1303 to_unicode: None,
1304 differences: None,
1305 descendant_font: None,
1306 cid_to_gid_map: None,
1307 metrics: FontMetrics {
1308 first_char: Some(105), last_char: Some(107), widths: Some(proportional_widths),
1311 missing_width: Some(500.0),
1312 kerning: None,
1313 },
1314 };
1315
1316 let monospace_widths = vec![600.0, 600.0, 600.0];
1318 let monospace_font = FontInfo {
1319 name: "Courier".to_string(),
1320 font_type: "Type1".to_string(),
1321 encoding: None,
1322 to_unicode: None,
1323 differences: None,
1324 descendant_font: None,
1325 cid_to_gid_map: None,
1326 metrics: FontMetrics {
1327 first_char: Some(105),
1328 last_char: Some(107),
1329 widths: Some(monospace_widths),
1330 missing_width: Some(600.0),
1331 kerning: None,
1332 },
1333 };
1334
1335 let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
1336 let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
1337
1338 assert!(
1340 prop_width < mono_width,
1341 "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
1342 prop_width,
1343 mono_width
1344 );
1345 }
1346
/// Verifies that pair kerning is added to the summed glyph advances: "AV" and
/// "AW" have kerning entries and must shrink accordingly, while the reversed
/// pair "VA" has no entry and must keep its plain width.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Width table for character codes 32..=126 (95 entries). Everything is
    // 500 units wide except the three glyphs exercised below.
    let mut widths = vec![500.0; 95];
    for &(code, advance) in &[(65_usize, 722.0), (86, 722.0), (87, 944.0)] {
        widths[code - 32] = advance;
    }

    // Kerning adjustments keyed by (left code, right code), in the same
    // 1/1000 units as the width table.
    let mut kerning = HashMap::new();
    for &(pair, adjustment) in &[((65, 86), -50.0), ((65, 87), -40.0)] {
        kerning.insert(pair, adjustment);
    }

    let font_info = FontInfo {
        name: "Helvetica".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: Some(kerning),
        },
    };

    // Floating-point results are compared with an absolute tolerance.
    let tolerance = 0.0001;

    // "AV": both advances plus the -50 kerning adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let diff_av = (width_av - expected_av).abs();
    assert!(
        diff_av < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        diff_av
    );

    // "AW": both advances plus the -40 kerning adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let diff_aw = (width_aw - expected_aw).abs();
    assert!(
        diff_aw < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        diff_aw
    );

    // "VA": the map has no (86, 65) entry, so only the advances count.
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let diff_va = (width_va - expected_va).abs();
    assert!(
        diff_va < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        diff_va
    );

    // Same two glyphs, opposite order: only the kerned order gets narrower.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1431
/// Feeds `parse_truetype_kern_table` a hand-assembled 128-byte font image
/// containing a zero-filled `head` table and a `kern` table with two pairs,
/// and checks that both pairs come back with their signed values.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Offset table (all fields big-endian): sfnt version 0x00010000,
    // numTables = 2, searchRange = 32, entrySelector = 1, rangeShift = 0.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00,
    ];

    // Table records: tag, checksum, offset, length.
    // `head` sits right after the directory (offset 44, 54 bytes).
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x00,
        0x00, 0x36,
    ];
    // `kern` follows `head` (offset 98, 30 bytes).
    let kern_record: [u8; 16] = [
        b'k', b'e', b'r', b'n', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00,
        0x00, 0x1E,
    ];

    // The `head` table contents are irrelevant here, so it is zero-filled.
    let head_table = [0u8; 54];

    // NOTE(review): the subtable's coverage (0x0000) and binary-search fields
    // (8, 0, 4) are not the values the OpenType spec derives for two pairs;
    // presumably the parser does not validate them - confirm.
    let kern_table: [u8; 30] = [
        // kern header: version = 0, nTables = 1
        0x00, 0x00, 0x00, 0x01,
        // subtable header: version = 0, length = 26, coverage = 0
        0x00, 0x00, 0x00, 0x1A, 0x00, 0x00,
        // format 0 header: nPairs = 2, searchRange = 8, entrySelector = 0, rangeShift = 4
        0x00, 0x02, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04,
        // pair (0x41, 0x56) -> 0xFFCE = -50
        0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE,
        // pair (0x41, 0x57) -> 0xFFD8 = -40
        0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8,
    ];

    // Stitch the sections together in file order.
    let ttf_data: Vec<u8> = [
        &offset_table[..],
        &head_record[..],
        &kern_record[..],
        &head_table[..],
        &kern_table[..],
    ]
    .concat();

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    let pair_av = kerning_map.get(&(65, 86));
    assert_eq!(
        pair_av,
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    let pair_aw = kerning_map.get(&(65, 87));
    assert_eq!(
        pair_aw,
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1514
/// Feeds `extract_truetype_kerning` a font image whose table directory lists
/// only a `head` table and checks that the call succeeds with an empty map
/// rather than failing.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // Offset table (all fields big-endian): sfnt version 0x00010000,
    // numTables = 1, searchRange = 16, entrySelector = 0, rangeShift = 0.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,
    ];

    // Single table record: tag `head`, checksum 0, offset 28, length 54.
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00,
        0x00, 0x36,
    ];

    // Zero-filled `head` table body.
    let head_table = [0u8; 54];

    let ttf_data: Vec<u8> = [&offset_table[..], &head_record[..], &head_table[..]].concat();

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    // No `kern` entry in the directory, so no pairs are expected.
    let kerning_map = result.unwrap();
    assert!(
        kerning_map.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1554}