1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are recorded for every text run and
    /// the page text is rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two text runs.
    pub space_threshold: f64,
    /// Vertical distance (in fragment coordinate units) above which two runs
    /// are considered to be on different lines.
    pub newline_threshold: f64,
    /// Sort recorded fragments by position before the text is rebuilt.
    pub sort_by_position: bool,
    /// After sorting, additionally try to detect columns and order the
    /// fragments column by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on a line that
    /// is treated as a column separator.
    pub column_threshold: f64,
    /// Join a word that was split by a trailing hyphen at a line break.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction: no layout tracking, position sorting enabled,
    /// column detection disabled, hyphenated words merged.
    fn default() -> Self {
        const DEFAULT_SPACE_THRESHOLD: f64 = 0.3;
        const DEFAULT_NEWLINE_THRESHOLD: f64 = 10.0;
        const DEFAULT_COLUMN_THRESHOLD: f64 = 50.0;

        ExtractionOptions {
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            space_threshold: DEFAULT_SPACE_THRESHOLD,
            newline_threshold: DEFAULT_NEWLINE_THRESHOLD,
            column_threshold: DEFAULT_COLUMN_THRESHOLD,
        }
    }
}
48
/// Result of extracting one page: the assembled text plus any positioned
/// fragments that were recorded along the way.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as a single string.
    pub text: String,
    /// Positioned text runs; only populated when the extractor runs with
    /// `preserve_layout` enabled.
    pub fragments: Vec<TextFragment>,
}
57
/// A single run of text together with where it was drawn and how.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// Horizontal position of the start of the run.
    pub x: f64,
    /// Vertical position of the run.
    pub y: f64,
    /// Estimated width of the run.
    pub width: f64,
    /// Height of the run; the extractor fills this with the font size.
    pub height: f64,
    /// Font size that was active when the run was shown.
    pub font_size: f64,
    /// Resource name of the active font, if one had been selected.
    pub font_name: Option<String>,
    /// Whether the font name suggests a bold face.
    pub is_bold: bool,
    /// Whether the font name suggests an italic or oblique face.
    pub is_italic: bool,
    /// Fill colour that was active when the run was shown, if any was set.
    pub color: Option<Color>,
}
82
83struct TextState {
85 text_matrix: [f64; 6],
87 text_line_matrix: [f64; 6],
89 ctm: [f64; 6],
91 leading: f64,
93 char_space: f64,
95 word_space: f64,
97 horizontal_scale: f64,
99 text_rise: f64,
101 font_size: f64,
103 font_name: Option<String>,
105 render_mode: u8,
107 fill_color: Option<Color>,
109}
110
111impl Default for TextState {
112 fn default() -> Self {
113 Self {
114 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117 leading: 0.0,
118 char_space: 0.0,
119 word_space: 0.0,
120 horizontal_scale: 100.0,
121 text_rise: 0.0,
122 font_size: 0.0,
123 font_name: None,
124 render_mode: 0,
125 fill_color: None,
126 }
127 }
128}
129
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// The comparison is case-insensitive. A name counts as bold when it contains
/// `bold`, `-b` or ` b `, or ends in ` b`; it counts as italic when it
/// contains `italic`, `oblique`, `-i` or ` i `, or ends in ` i`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    let bold = BOLD_MARKERS.iter().any(|marker| lowered.contains(*marker))
        || lowered.ends_with(" b");
    let italic = ITALIC_MARKERS.iter().any(|marker| lowered.contains(*marker))
        || lowered.ends_with(" i");

    (bold, italic)
}
168
/// Extracts text from parsed PDF pages according to a set of
/// [`ExtractionOptions`].
pub struct TextExtractor {
    /// Settings applied to every extraction performed by this instance.
    options: ExtractionOptions,
    /// Font information for the page currently being processed, keyed by the
    /// font's resource name. Cleared and refilled for each page.
    font_cache: HashMap<String, FontInfo>,
}
175
176impl TextExtractor {
177 pub fn new() -> Self {
179 Self {
180 options: ExtractionOptions::default(),
181 font_cache: HashMap::new(),
182 }
183 }
184
    /// Creates an extractor that uses the supplied `options` and starts with
    /// an empty font cache.
    pub fn with_options(options: ExtractionOptions) -> Self {
        Self {
            options,
            font_cache: HashMap::new(),
        }
    }
192
193 pub fn extract_from_document<R: Read + Seek>(
195 &mut self,
196 document: &PdfDocument<R>,
197 ) -> ParseResult<Vec<ExtractedText>> {
198 let page_count = document.page_count()?;
199 let mut results = Vec::new();
200
201 for i in 0..page_count {
202 let text = self.extract_from_page(document, i)?;
203 results.push(text);
204 }
205
206 Ok(results)
207 }
208
209 pub fn extract_from_page<R: Read + Seek>(
211 &mut self,
212 document: &PdfDocument<R>,
213 page_index: u32,
214 ) -> ParseResult<ExtractedText> {
215 let page = document.get_page(page_index)?;
217
218 self.extract_font_resources(&page, document)?;
220
221 let streams = page.content_streams_with_document(document)?;
223
224 let mut extracted_text = String::new();
225 let mut fragments = Vec::new();
226 let mut state = TextState::default();
227 let mut in_text_object = false;
228 let mut last_x = 0.0;
229 let mut last_y = 0.0;
230
231 for (stream_idx, stream_data) in streams.iter().enumerate() {
233 let operations = match ContentParser::parse_content(stream_data) {
234 Ok(ops) => ops,
235 Err(e) => {
236 tracing::debug!(
238 "Warning: Failed to parse content stream on page {}, stream {}/{}",
239 page_index + 1,
240 stream_idx + 1,
241 streams.len()
242 );
243 tracing::debug!(" Error: {}", e);
244 tracing::debug!(" Stream size: {} bytes", stream_data.len());
245
246 let preview_len = stream_data.len().min(100);
248 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249 tracing::debug!(
250 " Stream preview (first {} bytes): {:?}",
251 preview_len,
252 preview.chars().take(80).collect::<String>()
253 );
254
255 continue;
257 }
258 };
259
260 for op in operations {
261 match op {
262 ContentOperation::BeginText => {
263 in_text_object = true;
264 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267 }
268
269 ContentOperation::EndText => {
270 in_text_object = false;
271 }
272
273 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274 state.text_matrix =
275 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276 state.text_line_matrix =
277 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278 }
279
280 ContentOperation::MoveText(tx, ty) => {
281 let new_matrix = multiply_matrix(
283 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284 &state.text_line_matrix,
285 );
286 state.text_matrix = new_matrix;
287 state.text_line_matrix = new_matrix;
288 }
289
290 ContentOperation::NextLine => {
291 let new_matrix = multiply_matrix(
293 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294 &state.text_line_matrix,
295 );
296 state.text_matrix = new_matrix;
297 state.text_line_matrix = new_matrix;
298 }
299
300 ContentOperation::ShowText(text) => {
301 if in_text_object {
302 let text_bytes = &text;
303 let decoded = self.decode_text(text_bytes, &state)?;
304
305 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310 if !extracted_text.is_empty() {
312 let dx = x - last_x;
313 let dy = (y - last_y).abs();
314
315 if dy > self.options.newline_threshold {
316 extracted_text.push('\n');
317 } else if dx > self.options.space_threshold * state.font_size {
318 extracted_text.push(' ');
319 }
320 }
321
322 extracted_text.push_str(&decoded);
323
324 let font_info = state
326 .font_name
327 .as_ref()
328 .and_then(|name| self.font_cache.get(name));
329
330 if self.options.preserve_layout {
331 let (is_bold, is_italic) = state
333 .font_name
334 .as_ref()
335 .map(|name| parse_font_style(name))
336 .unwrap_or((false, false));
337
338 fragments.push(TextFragment {
339 text: decoded.clone(),
340 x,
341 y,
342 width: calculate_text_width(
343 &decoded,
344 state.font_size,
345 font_info,
346 ),
347 height: state.font_size,
348 font_size: state.font_size,
349 font_name: state.font_name.clone(),
350 is_bold,
351 is_italic,
352 color: state.fill_color,
353 });
354 }
355
356 last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358 last_y = y;
359
360 let text_width =
362 calculate_text_width(&decoded, state.font_size, font_info);
363 let tx = text_width * state.horizontal_scale / 100.0;
364 state.text_matrix =
365 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366 }
367 }
368
369 ContentOperation::ShowTextArray(array) => {
370 if in_text_object {
371 let font_info = state
373 .font_name
374 .as_ref()
375 .and_then(|name| self.font_cache.get(name));
376
377 for item in array {
378 match item {
379 TextElement::Text(text_bytes) => {
380 let decoded = self.decode_text(&text_bytes, &state)?;
381 extracted_text.push_str(&decoded);
382
383 let text_width = calculate_text_width(
385 &decoded,
386 state.font_size,
387 font_info,
388 );
389 let tx = text_width * state.horizontal_scale / 100.0;
390 state.text_matrix = multiply_matrix(
391 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392 &state.text_matrix,
393 );
394 }
395 TextElement::Spacing(adjustment) => {
396 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398 state.text_matrix = multiply_matrix(
399 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400 &state.text_matrix,
401 );
402 }
403 }
404 }
405 }
406 }
407
408 ContentOperation::SetFont(name, size) => {
409 state.font_name = Some(name);
410 state.font_size = size as f64;
411 }
412
413 ContentOperation::SetLeading(leading) => {
414 state.leading = leading as f64;
415 }
416
417 ContentOperation::SetCharSpacing(spacing) => {
418 state.char_space = spacing as f64;
419 }
420
421 ContentOperation::SetWordSpacing(spacing) => {
422 state.word_space = spacing as f64;
423 }
424
425 ContentOperation::SetHorizontalScaling(scale) => {
426 state.horizontal_scale = scale as f64;
427 }
428
429 ContentOperation::SetTextRise(rise) => {
430 state.text_rise = rise as f64;
431 }
432
433 ContentOperation::SetTextRenderMode(mode) => {
434 state.render_mode = mode as u8;
435 }
436
437 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438 let [a0, b0, c0, d0, e0, f0] = state.ctm;
440 let a = a as f64;
441 let b = b as f64;
442 let c = c as f64;
443 let d = d as f64;
444 let e = e as f64;
445 let f = f as f64;
446 state.ctm = [
447 a * a0 + b * c0,
448 a * b0 + b * d0,
449 c * a0 + d * c0,
450 c * b0 + d * d0,
451 e * a0 + f * c0 + e0,
452 e * b0 + f * d0 + f0,
453 ];
454 }
455
456 ContentOperation::SetNonStrokingGray(gray) => {
458 state.fill_color = Some(Color::gray(gray as f64));
459 }
460
461 ContentOperation::SetNonStrokingRGB(r, g, b) => {
462 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463 }
464
465 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466 state.fill_color =
467 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468 }
469
470 _ => {
471 }
473 }
474 }
475 }
476
477 if self.options.sort_by_position && !fragments.is_empty() {
479 self.sort_and_merge_fragments(&mut fragments);
480 }
481
482 if self.options.preserve_layout && !fragments.is_empty() {
485 fragments = self.merge_close_fragments(&fragments);
486 }
487
488 if self.options.preserve_layout && !fragments.is_empty() {
490 extracted_text = self.reconstruct_text_from_fragments(&fragments);
491 }
492
493 Ok(ExtractedText {
494 text: extracted_text,
495 fragments,
496 })
497 }
498
499 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501 let threshold = self.options.newline_threshold;
509 fragments.sort_by(|a, b| {
510 let band_a = if threshold > 0.0 {
512 (-a.y / threshold).round()
513 } else {
514 -a.y
515 };
516 let band_b = if threshold > 0.0 {
517 (-b.y / threshold).round()
518 } else {
519 -b.y
520 };
521
522 band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
524 });
525
526 if self.options.detect_columns {
528 self.detect_and_sort_columns(fragments);
529 }
530 }
531
532 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
534 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
536 let mut current_line: Vec<&mut TextFragment> = Vec::new();
537 let mut last_y = f64::INFINITY;
538
539 for fragment in fragments.iter_mut() {
540 let fragment_y = fragment.y;
541 if (last_y - fragment_y).abs() > self.options.newline_threshold
542 && !current_line.is_empty()
543 {
544 lines.push(current_line);
545 current_line = Vec::new();
546 }
547 current_line.push(fragment);
548 last_y = fragment_y;
549 }
550 if !current_line.is_empty() {
551 lines.push(current_line);
552 }
553
554 let mut column_boundaries = vec![0.0];
556 for line in &lines {
557 if line.len() > 1 {
558 for i in 0..line.len() - 1 {
559 let gap = line[i + 1].x - (line[i].x + line[i].width);
560 if gap > self.options.column_threshold {
561 let boundary = line[i].x + line[i].width + gap / 2.0;
562 if !column_boundaries
563 .iter()
564 .any(|&b| (b - boundary).abs() < 10.0)
565 {
566 column_boundaries.push(boundary);
567 }
568 }
569 }
570 }
571 }
572 column_boundaries.sort_by(|a, b| a.total_cmp(b));
573
574 if column_boundaries.len() > 1 {
576 fragments.sort_by(|a, b| {
577 let col_a = column_boundaries
579 .iter()
580 .position(|&boundary| a.x < boundary)
581 .unwrap_or(column_boundaries.len())
582 - 1;
583 let col_b = column_boundaries
584 .iter()
585 .position(|&boundary| b.x < boundary)
586 .unwrap_or(column_boundaries.len())
587 - 1;
588
589 if col_a != col_b {
590 col_a.cmp(&col_b)
591 } else {
592 b.y.total_cmp(&a.y)
594 }
595 });
596 }
597 }
598
599 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
601 let merged_fragments = self.merge_close_fragments(fragments);
603
604 let mut result = String::new();
605 let mut last_y = f64::INFINITY;
606 let mut last_x = 0.0;
607 let mut last_line_ended_with_hyphen = false;
608
609 for fragment in &merged_fragments {
610 let y_diff = (last_y - fragment.y).abs();
612 if !result.is_empty() && y_diff > self.options.newline_threshold {
613 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
615 if result.ends_with('-') {
617 result.pop();
618 }
619 } else {
620 result.push('\n');
621 }
622 } else if !result.is_empty() {
623 let x_gap = fragment.x - last_x;
625 if x_gap > self.options.space_threshold * fragment.font_size {
626 result.push(' ');
627 }
628 }
629
630 result.push_str(&fragment.text);
631 last_line_ended_with_hyphen = fragment.text.ends_with('-');
632 last_y = fragment.y;
633 last_x = fragment.x + fragment.width;
634 }
635
636 result
637 }
638
639 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
642 if fragments.is_empty() {
643 return Vec::new();
644 }
645
646 let mut merged = Vec::new();
647 let mut current = fragments[0].clone();
648
649 for fragment in &fragments[1..] {
650 let y_diff = (current.y - fragment.y).abs();
652 let x_gap = fragment.x - (current.x + current.width);
653
654 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
661 current.text.push_str(&fragment.text);
663 current.width = (fragment.x + fragment.width) - current.x;
664 } else {
665 merged.push(current);
667 current = fragment.clone();
668 }
669 }
670
671 merged.push(current);
672 merged
673 }
674
675 fn extract_font_resources<R: Read + Seek>(
677 &mut self,
678 page: &ParsedPage,
679 document: &PdfDocument<R>,
680 ) -> ParseResult<()> {
681 self.font_cache.clear();
683
684 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
687 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
688 {
689 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
690 for (font_name, font_obj) in font_dict.0.iter() {
692 if let Some(font_ref) = font_obj.as_reference() {
693 if let Ok(PdfObject::Dictionary(font_dict)) =
694 document.get_object(font_ref.0, font_ref.1)
695 {
696 let mut cmap_extractor: CMapTextExtractor<R> =
698 CMapTextExtractor::new();
699
700 if let Ok(font_info) =
701 cmap_extractor.extract_font_info(&font_dict, document)
702 {
703 let has_to_unicode = font_info.to_unicode.is_some();
704 self.font_cache.insert(font_name.0.clone(), font_info);
705 tracing::debug!(
706 "Cached font: {} (ToUnicode: {})",
707 font_name.0,
708 has_to_unicode
709 );
710 }
711 }
712 }
713 }
714 }
715 }
716 } else if let Some(resources) = page.get_resources() {
717 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
719 for (font_name, font_obj) in font_dict.0.iter() {
720 if let Some(font_ref) = font_obj.as_reference() {
721 if let Ok(PdfObject::Dictionary(font_dict)) =
722 document.get_object(font_ref.0, font_ref.1)
723 {
724 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
725
726 if let Ok(font_info) =
727 cmap_extractor.extract_font_info(&font_dict, document)
728 {
729 let has_to_unicode = font_info.to_unicode.is_some();
730 self.font_cache.insert(font_name.0.clone(), font_info);
731 tracing::debug!(
732 "Cached font: {} (ToUnicode: {})",
733 font_name.0,
734 has_to_unicode
735 );
736 }
737 }
738 }
739 }
740 }
741 }
742
743 Ok(())
744 }
745
746 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
748 use crate::text::encoding::TextEncoding;
749
750 if let Some(ref font_name) = state.font_name {
752 if let Some(font_info) = self.font_cache.get(font_name) {
753 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
755
756 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
758 if !decoded.trim().is_empty()
760 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
761 {
762 let sanitized = sanitize_extracted_text(&decoded);
764 tracing::debug!(
765 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
766 font_name,
767 text,
768 sanitized
769 );
770 return Ok(sanitized);
771 }
772 }
773
774 tracing::debug!(
775 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
776 font_name
777 );
778 }
779 }
780
781 let encoding = if let Some(ref font_name) = state.font_name {
783 match font_name.to_lowercase().as_str() {
784 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
785 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
786 name if name.contains("standard") => TextEncoding::StandardEncoding,
787 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
788 _ => {
789 if font_name.starts_with("Times")
791 || font_name.starts_with("Helvetica")
792 || font_name.starts_with("Courier")
793 {
794 TextEncoding::WinAnsiEncoding } else {
796 TextEncoding::PdfDocEncoding }
798 }
799 }
800 } else {
801 TextEncoding::WinAnsiEncoding };
803
804 let fallback_result = encoding.decode(text);
805 let sanitized = sanitize_extracted_text(&fallback_result);
807 tracing::debug!(
808 "Fallback encoding decoding: {:?} -> \"{}\"",
809 text,
810 sanitized
811 );
812 Ok(sanitized)
813 }
814}
815
/// The default extractor is the one returned by [`TextExtractor::new`].
impl Default for TextExtractor {
    fn default() -> Self {
        Self::new()
    }
}
821
/// Multiplies two affine matrices given in `[a, b, c, d, e, f]` form.
///
/// The result maps a point through `a` first and through `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
833
/// Maps the point `(x, y)` through an affine matrix in `[a, b, c, d, e, f]`
/// form and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
840
841fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
843 if let Some(font) = font_info {
845 if let Some(ref widths) = font.metrics.widths {
846 let first_char = font.metrics.first_char.unwrap_or(0);
847 let last_char = font.metrics.last_char.unwrap_or(255);
848 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
849
850 let mut total_width = 0.0;
851 let chars: Vec<char> = text.chars().collect();
852
853 for (i, &ch) in chars.iter().enumerate() {
854 let char_code = ch as u32;
855
856 let width = if char_code >= first_char && char_code <= last_char {
858 let index = (char_code - first_char) as usize;
859 widths.get(index).copied().unwrap_or(missing_width)
860 } else {
861 missing_width
862 };
863
864 total_width += width / 1000.0 * font_size;
866
867 if let Some(ref kerning) = font.metrics.kerning {
869 if i + 1 < chars.len() {
870 let next_char = chars[i + 1] as u32;
871 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
872 total_width += kern_value / 1000.0 * font_size;
874 }
875 }
876 }
877 }
878
879 return total_width;
880 }
881 }
882
883 text.len() as f64 * font_size * 0.5
885}
886
/// Cleans up decoded text for presentation.
///
/// * NUL characters and spaces become a single space; a run of them (also
///   directly after a tab) collapses to one.
/// * Tabs, line feeds and carriage returns are kept as they are.
/// * All other ASCII control characters (including `U+0003`) are removed.
/// * Everything else is copied through unchanged.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut previous_was_space = false;

    for ch in text.chars() {
        match ch {
            '\0' | ' ' => {
                if !previous_was_space {
                    cleaned.push(' ');
                    previous_was_space = true;
                }
            }
            '\t' => {
                cleaned.push('\t');
                previous_was_space = true;
            }
            '\n' | '\r' => {
                cleaned.push(ch);
                previous_was_space = false;
            }
            // Dropped without affecting space collapsing.
            other if other.is_ascii_control() => {}
            other => {
                cleaned.push(other);
                previous_was_space = false;
            }
        }
    }

    cleaned
}
979
980#[cfg(test)]
981mod tests {
982 use super::*;
983
984 #[test]
985 fn test_matrix_multiplication() {
986 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
987 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
988
989 let result = multiply_matrix(&identity, &translation);
990 assert_eq!(result, translation);
991
992 let result2 = multiply_matrix(&translation, &identity);
993 assert_eq!(result2, translation);
994 }
995
996 #[test]
997 fn test_transform_point() {
998 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
999 let (x, y) = transform_point(5.0, 5.0, &translation);
1000 assert_eq!(x, 15.0);
1001 assert_eq!(y, 25.0);
1002 }
1003
1004 #[test]
1005 fn test_extraction_options_default() {
1006 let options = ExtractionOptions::default();
1007 assert!(!options.preserve_layout);
1008 assert_eq!(options.space_threshold, 0.3);
1009 assert_eq!(options.newline_threshold, 10.0);
1010 assert!(options.sort_by_position);
1011 assert!(!options.detect_columns);
1012 assert_eq!(options.column_threshold, 50.0);
1013 assert!(options.merge_hyphenated);
1014 }
1015
1016 #[test]
1017 fn test_extraction_options_custom() {
1018 let options = ExtractionOptions {
1019 preserve_layout: true,
1020 space_threshold: 0.5,
1021 newline_threshold: 15.0,
1022 sort_by_position: false,
1023 detect_columns: true,
1024 column_threshold: 75.0,
1025 merge_hyphenated: false,
1026 };
1027 assert!(options.preserve_layout);
1028 assert_eq!(options.space_threshold, 0.5);
1029 assert_eq!(options.newline_threshold, 15.0);
1030 assert!(!options.sort_by_position);
1031 assert!(options.detect_columns);
1032 assert_eq!(options.column_threshold, 75.0);
1033 assert!(!options.merge_hyphenated);
1034 }
1035
1036 #[test]
1037 fn test_parse_font_style_bold() {
1038 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1040 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1041
1042 assert_eq!(parse_font_style("Arial Bold"), (true, false));
1044 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1045
1046 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1048 }
1049
1050 #[test]
1051 fn test_parse_font_style_italic() {
1052 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1054 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1055
1056 assert_eq!(parse_font_style("Arial Italic"), (false, true));
1058 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1059
1060 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1062 }
1063
1064 #[test]
1065 fn test_parse_font_style_bold_italic() {
1066 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1067 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1068 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1069 }
1070
1071 #[test]
1072 fn test_parse_font_style_regular() {
1073 assert_eq!(parse_font_style("Helvetica"), (false, false));
1074 assert_eq!(parse_font_style("Times-Roman"), (false, false));
1075 assert_eq!(parse_font_style("Courier"), (false, false));
1076 assert_eq!(parse_font_style("Arial"), (false, false));
1077 }
1078
1079 #[test]
1080 fn test_parse_font_style_edge_cases() {
1081 assert_eq!(parse_font_style(""), (false, false));
1083 assert_eq!(parse_font_style("UnknownFont"), (false, false));
1084
1085 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1087 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1088 }
1089
    /// A `TextFragment` can be built from a struct literal and its text and
    /// geometry fields read back unchanged.
    #[test]
    fn test_text_fragment() {
        let fragment = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
        };
        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }
1111
1112 #[test]
1113 fn test_extracted_text() {
1114 let fragments = vec![
1115 TextFragment {
1116 text: "Hello".to_string(),
1117 x: 100.0,
1118 y: 200.0,
1119 width: 50.0,
1120 height: 12.0,
1121 font_size: 10.0,
1122 font_name: None,
1123 is_bold: false,
1124 is_italic: false,
1125 color: None,
1126 },
1127 TextFragment {
1128 text: "World".to_string(),
1129 x: 160.0,
1130 y: 200.0,
1131 width: 50.0,
1132 height: 12.0,
1133 font_size: 10.0,
1134 font_name: None,
1135 is_bold: false,
1136 is_italic: false,
1137 color: None,
1138 },
1139 ];
1140
1141 let extracted = ExtractedText {
1142 text: "Hello World".to_string(),
1143 fragments: fragments,
1144 };
1145
1146 assert_eq!(extracted.text, "Hello World");
1147 assert_eq!(extracted.fragments.len(), 2);
1148 assert_eq!(extracted.fragments[0].text, "Hello");
1149 assert_eq!(extracted.fragments[1].text, "World");
1150 }
1151
1152 #[test]
1153 fn test_text_state_default() {
1154 let state = TextState::default();
1155 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1156 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1157 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1158 assert_eq!(state.leading, 0.0);
1159 assert_eq!(state.char_space, 0.0);
1160 assert_eq!(state.word_space, 0.0);
1161 assert_eq!(state.horizontal_scale, 100.0);
1162 assert_eq!(state.text_rise, 0.0);
1163 assert_eq!(state.font_size, 0.0);
1164 assert!(state.font_name.is_none());
1165 assert_eq!(state.render_mode, 0);
1166 }
1167
1168 #[test]
1169 fn test_matrix_operations() {
1170 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
1173 assert_eq!(x, 0.0);
1174 assert_eq!(y, 1.0);
1175
1176 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1178 let (x, y) = transform_point(5.0, 5.0, &scale);
1179 assert_eq!(x, 10.0);
1180 assert_eq!(y, 15.0);
1181
1182 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1184 let (x, y) = transform_point(1.0, 1.0, &complex);
1185 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
1188
1189 #[test]
1190 fn test_text_extractor_new() {
1191 let extractor = TextExtractor::new();
1192 let options = extractor.options;
1193 assert!(!options.preserve_layout);
1194 assert_eq!(options.space_threshold, 0.3);
1195 assert_eq!(options.newline_threshold, 10.0);
1196 assert!(options.sort_by_position);
1197 assert!(!options.detect_columns);
1198 assert_eq!(options.column_threshold, 50.0);
1199 assert!(options.merge_hyphenated);
1200 }
1201
1202 #[test]
1203 fn test_text_extractor_with_options() {
1204 let options = ExtractionOptions {
1205 preserve_layout: true,
1206 space_threshold: 0.3,
1207 newline_threshold: 12.0,
1208 sort_by_position: false,
1209 detect_columns: true,
1210 column_threshold: 60.0,
1211 merge_hyphenated: false,
1212 };
1213 let extractor = TextExtractor::with_options(options.clone());
1214 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1215 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1216 assert_eq!(
1217 extractor.options.newline_threshold,
1218 options.newline_threshold
1219 );
1220 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1221 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1222 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1223 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1224 }
1225
1226 #[test]
1231 fn test_calculate_text_width_with_no_font_info() {
1232 let width = calculate_text_width("Hello", 12.0, None);
1234
1235 assert_eq!(
1237 width, 30.0,
1238 "Without font info, should use simplified calculation: len * font_size * 0.5"
1239 );
1240 }
1241
    /// Font information without a widths array falls back to the simplified
    /// half-font-size-per-character estimate.
    #[test]
    fn test_calculate_text_width_with_empty_metrics() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Metrics are present but carry no widths array.
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: None,
                last_char: None,
                widths: None,
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        // 5 characters * 12.0 * 0.5
        assert_eq!(
            width, 30.0,
            "Without widths array, should fall back to simplified calculation"
        );
    }
1272
    /// With a full widths array the width is the sum of the per-character
    /// widths, scaled by the font size.
    #[test]
    fn test_calculate_text_width_with_complete_metrics() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Widths for codes 32..=126; only the letters of "Hello" are set.
        let mut widths = vec![0.0; 95];
        widths[72 - 32] = 722.0; // 'H'
        widths[101 - 32] = 556.0; // 'e'
        widths[108 - 32] = 278.0; // 'l'
        widths[111 - 32] = 611.0; // 'o'

        let font_info = FontInfo {
            name: "Helvetica".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        // H + e + l + l + o, in thousandths of the font size.
        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
        // The implementation sums per character, so compare with a tolerance.
        let tolerance = 0.0001;
        assert!(
            (width - expected).abs() < tolerance,
            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
            expected,
            width,
            (width - expected).abs()
        );

        // The metrics-based result must not coincide with the fallback estimate.
        let simplified = 5.0 * 12.0 * 0.5;
        assert_ne!(
            width, simplified,
            "Metrics-based calculation should differ from simplified (30.0)"
        );
    }
1330
    /// A character outside `first_char..=last_char` is measured with
    /// `missing_width`.
    #[test]
    fn test_calculate_text_width_character_outside_range() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Widths for codes 65..=90 ('A'..='Z') only.
        let widths = vec![722.0; 26];
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(65),
                last_char: Some(90),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        // 'A' is inside the range, '1' (code 49) is not.
        let width = calculate_text_width("A1", 10.0, Some(&font_info));

        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
        assert_eq!(
            width, expected,
            "Should use missing_width for characters outside range"
        );
    }
1368
    /// A width of 0.0 stored in the array is used as-is and is not replaced
    /// by `missing_width`.
    #[test]
    fn test_calculate_text_width_missing_width_in_array() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Index 10 corresponds to code 42 because first_char is 32.
        let mut widths = vec![500.0; 95];
        widths[10] = 0.0;

        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(600.0),
                kerning: None,
            },
        };

        let char_code = 42u8 as char;
        let text = char_code.to_string();
        let width = calculate_text_width(&text, 10.0, Some(&font_info));

        assert_eq!(
            width, 0.0,
            "Should use 0.0 width from array, not missing_width"
        );
    }
1406
    /// An empty string has zero width both with and without font metrics.
    #[test]
    fn test_calculate_text_width_empty_string() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        // Metrics path.
        let width = calculate_text_width("", 12.0, Some(&font_info));
        assert_eq!(width, 0.0, "Empty string should have zero width");

        // Fallback path.
        let width_no_font = calculate_text_width("", 12.0, None);
        assert_eq!(
            width_no_font, 0.0,
            "Empty string should have zero width (no font)"
        );
    }
1438
/// Checks the width of a non-ASCII character whose code point lies outside
/// the font's `first_char..=last_char` range.
///
/// The font covers codes 32..=126; `'Ñ'` is U+00D1 (209), so the expected
/// width is `missing_width` (600.0) scaled to font size 10.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(vec![500.0; 95]),
            missing_width: Some(600.0),
            kerning: None,
        },
    };

    let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // missing_width scaled by font_size / 1000 (nominally 6.0).
    let expected = 600.0 / 1000.0 * 10.0;

    // Compare with a tolerance: a hard-coded 6.0 only matches a scaled float
    // when the rounding happens to land exactly on it.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Unicode character outside range should use missing_width: expected {}, got {}",
        expected,
        width
    );
}
1471
/// Checks that the measured width of the same text scales linearly with the
/// font size.
///
/// The font defines a single glyph (code 65, `'A'`) of width 722.0, measured
/// at sizes 10 and 20.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(65),
            widths: Some(vec![722.0]),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
    let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

    let expected_10 = 722.0 / 1000.0 * 10.0;
    let expected_20 = 722.0 / 1000.0 * 20.0;

    // Floats are compared with a tolerance so the test does not depend on the
    // exact order in which the implementation multiplies and divides.
    let tolerance = 0.0001;
    assert!(
        (width_10 - expected_10).abs() < tolerance,
        "Width at font size 10: expected {}, got {}",
        expected_10,
        width_10
    );
    assert!(
        (width_20 - expected_20).abs() < tolerance,
        "Width at font size 20: expected {}, got {}",
        expected_20,
        width_20
    );
    assert!(
        (width_20 - width_10 * 2.0).abs() < tolerance,
        "Width should scale linearly with font size: size 10 gave {}, size 20 gave {}",
        width_10,
        width_20
    );
}
1506
/// Compares the measured width of `"i"` under a proportional font and a
/// monospace font.
///
/// Both fonts cover codes 105..=107 (`'i'`, `'j'`, `'k'`); the proportional
/// font gives `'i'` 278.0 units while the monospace font gives every glyph
/// 600.0, so the proportional measurement must be the smaller one.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts differ only in name, widths and missing width.
    let build_font = |name: &str, widths, missing_width| FontInfo {
        name: name.to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        },
    };

    let proportional_font = build_font("Helvetica", vec![278.0, 556.0, 722.0], 500.0);
    let monospace_font = build_font("Courier", vec![600.0, 600.0, 600.0], 600.0);

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    let proportional_is_narrower = prop_width < mono_width;
    assert!(
        proportional_is_narrower,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1560
/// Checks that kerning pairs stored in the font metrics adjust the measured
/// width of the affected character pairs.
///
/// The font registers kerning for (65, 86) = "AV" at -50.0 and (65, 87) =
/// "AW" at -40.0. "VA" has no entry, so its expected width is the plain sum
/// of the two glyph widths.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Widths for codes 32..=126, default 500.0, with overrides for A, V and W.
    let mut widths = vec![500.0; 95];
    for &(code, glyph_width) in [(65usize, 722.0), (86, 722.0), (87, 944.0)].iter() {
        widths[code - 32] = glyph_width;
    }

    // (left code, right code) -> adjustment in glyph units.
    let kerning = vec![((65, 86), -50.0), ((65, 87), -40.0)]
        .into_iter()
        .collect::<HashMap<_, _>>();

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(widths),
        missing_width: Some(500.0),
        kerning: Some(kerning),
    };
    let font_info = FontInfo {
        name: "Helvetica".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    let tolerance = 0.0001;

    // "AV": both glyph widths plus the -50.0 kerning adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let diff_av = (width_av - expected_av).abs();
    assert!(
        diff_av < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        diff_av
    );

    // "AW": both glyph widths plus the -40.0 kerning adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let diff_aw = (width_aw - expected_aw).abs();
    assert!(
        diff_aw < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        diff_aw
    );

    // "VA": no kerning entry for this ordering, so only the glyph widths count.
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let diff_va = (width_va - expected_va).abs();
    assert!(
        diff_va < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        diff_va
    );

    // The kerned pair must come out narrower than the unkerned one.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1645
/// Feeds a hand-built 128-byte font image with a `head` and a `kern` table
/// entry to `parse_truetype_kern_table` and expects two kerning pairs back:
/// (65, 86) -> -50.0 and (65, 87) -> -40.0.
///
/// NOTE(review): the field names in the comments below follow the usual
/// TrueType offset-table / `kern` version 0 layout; confirm against what the
/// parser actually reads.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    let mut ttf_data: Vec<u8> = Vec::new();

    // 12-byte header: 0x00010000, then four big-endian u16 values
    // (presumably numTables = 2, searchRange, entrySelector, rangeShift).
    ttf_data.extend_from_slice(&0x0001_0000u32.to_be_bytes());
    for field in [2u16, 0x0020, 1, 0].iter() {
        ttf_data.extend_from_slice(&field.to_be_bytes());
    }

    // Two 16-byte directory records: tag, then three big-endian u32 values
    // (presumably checksum, offset, length).
    let directory: [(&[u8; 4], u32, u32, u32); 2] = [
        (b"head", 0, 0x2C, 0x36),
        (b"kern", 0, 0x62, 0x1E),
    ];
    for &(tag, checksum, offset, length) in directory.iter() {
        ttf_data.extend_from_slice(tag);
        ttf_data.extend_from_slice(&checksum.to_be_bytes());
        ttf_data.extend_from_slice(&offset.to_be_bytes());
        ttf_data.extend_from_slice(&length.to_be_bytes());
    }

    // 54 (0x36) zero bytes standing in for the `head` table at offset 0x2C,
    // which puts the next byte at offset 0x62.
    let head_end = ttf_data.len() + 54;
    ttf_data.resize(head_end, 0);

    // 30 (0x1E) bytes of kern data, written as big-endian 16-bit words.
    let kern_words: [u16; 15] = [
        0x0000, 0x0001, // presumably table version 0, one subtable
        0x0000, 0x001A, 0x0000, // presumably subtable version, length 26, coverage
        0x0002, 0x0008, 0x0000, 0x0004, // presumably nPairs = 2 and search fields
        0x0041, 0x0056, 0xFFCE, // 65, 86, 0xFFCE (= -50 as i16)
        0x0041, 0x0057, 0xFFD8, // 65, 87, 0xFFD8 (= -40 as i16)
    ];
    for word in kern_words.iter() {
        ttf_data.extend_from_slice(&word.to_be_bytes());
    }

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    let pair_av = kerning_map.get(&(65, 86));
    assert_eq!(
        pair_av,
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    let pair_aw = kerning_map.get(&(65, 87));
    assert_eq!(
        pair_aw,
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1728
/// Feeds an 82-byte font image whose table directory lists only `head` to
/// `extract_truetype_kerning` and expects `Ok` with an empty map.
///
/// NOTE(review): the field names in the comments below follow the usual
/// TrueType offset-table layout; confirm against what the parser reads.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    let mut ttf_data: Vec<u8> = Vec::with_capacity(82);

    // 12-byte header: 0x00010000, then four big-endian u16 values
    // (presumably numTables = 1, searchRange, entrySelector, rangeShift).
    ttf_data.extend_from_slice(&0x0001_0000u32.to_be_bytes());
    for field in [1u16, 0x0010, 0, 0].iter() {
        ttf_data.extend_from_slice(&field.to_be_bytes());
    }

    // Single 16-byte directory record: tag "head", then three big-endian u32
    // values (presumably checksum 0, offset 0x1C, length 0x36).
    ttf_data.extend_from_slice(b"head");
    for field in [0u32, 0x1C, 0x36].iter() {
        ttf_data.extend_from_slice(&field.to_be_bytes());
    }

    // 54 (0x36) zero bytes of placeholder table data starting at offset 0x1C.
    let total_len = ttf_data.len() + 54;
    ttf_data.resize(total_len, 0);

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let kerning_map = result.unwrap();
    let found_no_pairs = kerning_map.is_empty();
    assert!(
        found_no_pairs,
        "Should return empty HashMap when no kern table exists"
    );
}
1768}