1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When set, positioned fragments are recorded for each shown string and the page text
    /// is rebuilt from those fragments instead of the raw concatenation.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which a space is inserted.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text are treated as being on different lines.
    pub newline_threshold: f64,
    /// Sort collected fragments by position before the text is rebuilt.
    pub sort_by_position: bool,
    /// Run column detection as part of position sorting.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on a line that marks a column boundary.
    pub column_threshold: f64,
    /// When rebuilding text from fragments, join a line that ends in `-` with the next line
    /// (dropping the hyphen) instead of emitting a newline.
    pub merge_hyphenated: bool,
}
34
35impl Default for ExtractionOptions {
36 fn default() -> Self {
37 Self {
38 preserve_layout: false,
39 space_threshold: 0.3,
40 newline_threshold: 10.0,
41 sort_by_position: true,
42 detect_columns: false,
43 column_threshold: 50.0,
44 merge_hyphenated: true,
45 }
46 }
47}
48
/// Result of extracting one page: the page text plus any positioned fragments.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted page text.
    pub text: String,
    /// Positioned pieces of text; only populated when layout preservation is enabled.
    pub fragments: Vec<TextFragment>,
}
57
/// A run of decoded text together with where and how it was shown on the page.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the fragment.
    pub text: String,
    /// X coordinate of the fragment's origin after applying the text and transform matrices.
    pub x: f64,
    /// Y coordinate of the fragment's origin after applying the text and transform matrices.
    pub y: f64,
    /// Estimated width of the text, computed from font metrics when available.
    pub width: f64,
    /// Height of the fragment; the extractor stores the font size here.
    pub height: f64,
    /// Font size in effect when the text was shown.
    pub font_size: f64,
    /// Name of the font selected when the text was shown, if any.
    pub font_name: Option<String>,
    /// Whether the font name looks like a bold face (see `parse_font_style`).
    pub is_bold: bool,
    /// Whether the font name looks like an italic/oblique face (see `parse_font_style`).
    pub is_italic: bool,
    /// Non-stroking (fill) colour in effect when the text was shown, if one was set.
    pub color: Option<Color>,
}
82
/// Mutable text/graphics state tracked while walking a page's content operations.
struct TextState {
    /// Current text matrix; reset at the start of a text object and advanced after shown text.
    text_matrix: [f64; 6],
    /// Text line matrix; updated together with `text_matrix` by the positioning operators.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix accumulated from `SetTransformMatrix` operations.
    ctm: [f64; 6],
    /// Leading; its negation is the vertical offset applied by `NextLine`.
    leading: f64,
    /// Character spacing as set by `SetCharSpacing` (recorded; not read in the code visible here).
    char_space: f64,
    /// Word spacing as set by `SetWordSpacing` (recorded; not read in the code visible here).
    word_space: f64,
    /// Horizontal scaling in percent; applied to the advance after shown text.
    horizontal_scale: f64,
    /// Text rise as set by `SetTextRise` (recorded; not read in the code visible here).
    text_rise: f64,
    /// Font size selected by the last `SetFont`.
    font_size: f64,
    /// Font name selected by the last `SetFont`; used as the key into the font cache.
    font_name: Option<String>,
    /// Text render mode as set by `SetTextRenderMode` (recorded; not read in the code visible here).
    render_mode: u8,
    /// Current non-stroking (fill) colour, if one has been set.
    fill_color: Option<Color>,
}
110
111impl Default for TextState {
112 fn default() -> Self {
113 Self {
114 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117 leading: 0.0,
118 char_space: 0.0,
119 word_space: 0.0,
120 horizontal_scale: 100.0,
121 text_rise: 0.0,
122 font_size: 0.0,
123 font_name: None,
124 render_mode: 0,
125 fill_color: None,
126 }
127 }
128}
129
/// Guesses bold/italic styling from a font name.
///
/// The name is lower-cased and searched for style markers: the words `bold`, `italic`
/// and `oblique`, plus the one-letter abbreviations `b` / `i` when they follow a hyphen,
/// stand between spaces, or end the name after a space.
///
/// Returns `(is_bold, is_italic)`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // Substrings that signal a style wherever they occur in the lower-cased name.
    const BOLD_MARKERS: [&str; 3] = ["bold", "-b", " b "];
    const ITALIC_MARKERS: [&str; 4] = ["italic", "oblique", "-i", " i "];

    let lowered = font_name.to_lowercase();

    let bold = BOLD_MARKERS.iter().any(|marker| lowered.contains(*marker))
        || lowered.ends_with(" b");
    let italic = ITALIC_MARKERS.iter().any(|marker| lowered.contains(*marker))
        || lowered.ends_with(" i");

    (bold, italic)
}
168
/// Extracts text from parsed PDF pages according to a set of [`ExtractionOptions`].
pub struct TextExtractor {
    /// Settings applied to every extraction performed by this instance.
    options: ExtractionOptions,
    /// Font information keyed by font resource name; cleared and refilled for each page.
    font_cache: HashMap<String, FontInfo>,
}
175
176impl TextExtractor {
177 pub fn new() -> Self {
179 Self {
180 options: ExtractionOptions::default(),
181 font_cache: HashMap::new(),
182 }
183 }
184
185 pub fn with_options(options: ExtractionOptions) -> Self {
187 Self {
188 options,
189 font_cache: HashMap::new(),
190 }
191 }
192
193 pub fn extract_from_document<R: Read + Seek>(
195 &mut self,
196 document: &PdfDocument<R>,
197 ) -> ParseResult<Vec<ExtractedText>> {
198 let page_count = document.page_count()?;
199 let mut results = Vec::new();
200
201 for i in 0..page_count {
202 let text = self.extract_from_page(document, i)?;
203 results.push(text);
204 }
205
206 Ok(results)
207 }
208
209 pub fn extract_from_page<R: Read + Seek>(
211 &mut self,
212 document: &PdfDocument<R>,
213 page_index: u32,
214 ) -> ParseResult<ExtractedText> {
215 let page = document.get_page(page_index)?;
217
218 self.extract_font_resources(&page, document)?;
220
221 let streams = page.content_streams_with_document(document)?;
223
224 let mut extracted_text = String::new();
225 let mut fragments = Vec::new();
226 let mut state = TextState::default();
227 let mut in_text_object = false;
228 let mut last_x = 0.0;
229 let mut last_y = 0.0;
230
231 for (stream_idx, stream_data) in streams.iter().enumerate() {
233 let operations = match ContentParser::parse_content(stream_data) {
234 Ok(ops) => ops,
235 Err(e) => {
236 tracing::debug!(
238 "Warning: Failed to parse content stream on page {}, stream {}/{}",
239 page_index + 1,
240 stream_idx + 1,
241 streams.len()
242 );
243 tracing::debug!(" Error: {}", e);
244 tracing::debug!(" Stream size: {} bytes", stream_data.len());
245
246 let preview_len = stream_data.len().min(100);
248 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249 tracing::debug!(
250 " Stream preview (first {} bytes): {:?}",
251 preview_len,
252 preview.chars().take(80).collect::<String>()
253 );
254
255 continue;
257 }
258 };
259
260 for op in operations {
261 match op {
262 ContentOperation::BeginText => {
263 in_text_object = true;
264 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267 }
268
269 ContentOperation::EndText => {
270 in_text_object = false;
271 }
272
273 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274 state.text_matrix =
275 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276 state.text_line_matrix =
277 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278 }
279
280 ContentOperation::MoveText(tx, ty) => {
281 let new_matrix = multiply_matrix(
283 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284 &state.text_line_matrix,
285 );
286 state.text_matrix = new_matrix;
287 state.text_line_matrix = new_matrix;
288 }
289
290 ContentOperation::NextLine => {
291 let new_matrix = multiply_matrix(
293 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294 &state.text_line_matrix,
295 );
296 state.text_matrix = new_matrix;
297 state.text_line_matrix = new_matrix;
298 }
299
300 ContentOperation::ShowText(text) => {
301 if in_text_object {
302 let text_bytes = &text;
303 let decoded = self.decode_text(text_bytes, &state)?;
304
305 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310 if !extracted_text.is_empty() {
312 let dx = x - last_x;
313 let dy = (y - last_y).abs();
314
315 if dy > self.options.newline_threshold {
316 extracted_text.push('\n');
317 } else if dx > self.options.space_threshold * state.font_size {
318 extracted_text.push(' ');
319 }
320 }
321
322 extracted_text.push_str(&decoded);
323
324 let font_info = state
326 .font_name
327 .as_ref()
328 .and_then(|name| self.font_cache.get(name));
329
330 if self.options.preserve_layout {
331 let (is_bold, is_italic) = state
333 .font_name
334 .as_ref()
335 .map(|name| parse_font_style(name))
336 .unwrap_or((false, false));
337
338 fragments.push(TextFragment {
339 text: decoded.clone(),
340 x,
341 y,
342 width: calculate_text_width(
343 &decoded,
344 state.font_size,
345 font_info,
346 ),
347 height: state.font_size,
348 font_size: state.font_size,
349 font_name: state.font_name.clone(),
350 is_bold,
351 is_italic,
352 color: state.fill_color,
353 });
354 }
355
356 last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358 last_y = y;
359
360 let text_width =
362 calculate_text_width(&decoded, state.font_size, font_info);
363 let tx = text_width * state.horizontal_scale / 100.0;
364 state.text_matrix =
365 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366 }
367 }
368
369 ContentOperation::ShowTextArray(array) => {
370 if in_text_object {
371 let font_info = state
373 .font_name
374 .as_ref()
375 .and_then(|name| self.font_cache.get(name));
376
377 for item in array {
378 match item {
379 TextElement::Text(text_bytes) => {
380 let decoded = self.decode_text(&text_bytes, &state)?;
381 extracted_text.push_str(&decoded);
382
383 let text_width = calculate_text_width(
385 &decoded,
386 state.font_size,
387 font_info,
388 );
389 let tx = text_width * state.horizontal_scale / 100.0;
390 state.text_matrix = multiply_matrix(
391 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392 &state.text_matrix,
393 );
394 }
395 TextElement::Spacing(adjustment) => {
396 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398 state.text_matrix = multiply_matrix(
399 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400 &state.text_matrix,
401 );
402 }
403 }
404 }
405 }
406 }
407
408 ContentOperation::SetFont(name, size) => {
409 state.font_name = Some(name);
410 state.font_size = size as f64;
411 }
412
413 ContentOperation::SetLeading(leading) => {
414 state.leading = leading as f64;
415 }
416
417 ContentOperation::SetCharSpacing(spacing) => {
418 state.char_space = spacing as f64;
419 }
420
421 ContentOperation::SetWordSpacing(spacing) => {
422 state.word_space = spacing as f64;
423 }
424
425 ContentOperation::SetHorizontalScaling(scale) => {
426 state.horizontal_scale = scale as f64;
427 }
428
429 ContentOperation::SetTextRise(rise) => {
430 state.text_rise = rise as f64;
431 }
432
433 ContentOperation::SetTextRenderMode(mode) => {
434 state.render_mode = mode as u8;
435 }
436
437 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438 let [a0, b0, c0, d0, e0, f0] = state.ctm;
440 let a = a as f64;
441 let b = b as f64;
442 let c = c as f64;
443 let d = d as f64;
444 let e = e as f64;
445 let f = f as f64;
446 state.ctm = [
447 a * a0 + b * c0,
448 a * b0 + b * d0,
449 c * a0 + d * c0,
450 c * b0 + d * d0,
451 e * a0 + f * c0 + e0,
452 e * b0 + f * d0 + f0,
453 ];
454 }
455
456 ContentOperation::SetNonStrokingGray(gray) => {
458 state.fill_color = Some(Color::gray(gray as f64));
459 }
460
461 ContentOperation::SetNonStrokingRGB(r, g, b) => {
462 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463 }
464
465 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466 state.fill_color =
467 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468 }
469
470 _ => {
471 }
473 }
474 }
475 }
476
477 if self.options.sort_by_position && !fragments.is_empty() {
479 self.sort_and_merge_fragments(&mut fragments);
480 }
481
482 if self.options.preserve_layout && !fragments.is_empty() {
485 fragments = self.merge_close_fragments(&fragments);
486 }
487
488 if self.options.preserve_layout && !fragments.is_empty() {
490 extracted_text = self.reconstruct_text_from_fragments(&fragments);
491 }
492
493 Ok(ExtractedText {
494 text: extracted_text,
495 fragments,
496 })
497 }
498
499 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501 fragments.sort_by(|a, b| {
503 let y_diff = (b.y - a.y).abs();
505 if y_diff < self.options.newline_threshold {
506 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
508 } else {
509 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
511 }
512 });
513
514 if self.options.detect_columns {
516 self.detect_and_sort_columns(fragments);
517 }
518 }
519
520 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
522 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
524 let mut current_line: Vec<&mut TextFragment> = Vec::new();
525 let mut last_y = f64::INFINITY;
526
527 for fragment in fragments.iter_mut() {
528 let fragment_y = fragment.y;
529 if (last_y - fragment_y).abs() > self.options.newline_threshold
530 && !current_line.is_empty()
531 {
532 lines.push(current_line);
533 current_line = Vec::new();
534 }
535 current_line.push(fragment);
536 last_y = fragment_y;
537 }
538 if !current_line.is_empty() {
539 lines.push(current_line);
540 }
541
542 let mut column_boundaries = vec![0.0];
544 for line in &lines {
545 if line.len() > 1 {
546 for i in 0..line.len() - 1 {
547 let gap = line[i + 1].x - (line[i].x + line[i].width);
548 if gap > self.options.column_threshold {
549 let boundary = line[i].x + line[i].width + gap / 2.0;
550 if !column_boundaries
551 .iter()
552 .any(|&b| (b - boundary).abs() < 10.0)
553 {
554 column_boundaries.push(boundary);
555 }
556 }
557 }
558 }
559 }
560 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
561
562 if column_boundaries.len() > 1 {
564 fragments.sort_by(|a, b| {
565 let col_a = column_boundaries
567 .iter()
568 .position(|&boundary| a.x < boundary)
569 .unwrap_or(column_boundaries.len())
570 - 1;
571 let col_b = column_boundaries
572 .iter()
573 .position(|&boundary| b.x < boundary)
574 .unwrap_or(column_boundaries.len())
575 - 1;
576
577 if col_a != col_b {
578 col_a.cmp(&col_b)
579 } else {
580 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
582 }
583 });
584 }
585 }
586
587 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
589 let merged_fragments = self.merge_close_fragments(fragments);
591
592 let mut result = String::new();
593 let mut last_y = f64::INFINITY;
594 let mut last_x = 0.0;
595 let mut last_line_ended_with_hyphen = false;
596
597 for fragment in &merged_fragments {
598 let y_diff = (last_y - fragment.y).abs();
600 if !result.is_empty() && y_diff > self.options.newline_threshold {
601 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
603 if result.ends_with('-') {
605 result.pop();
606 }
607 } else {
608 result.push('\n');
609 }
610 } else if !result.is_empty() {
611 let x_gap = fragment.x - last_x;
613 if x_gap > self.options.space_threshold * fragment.font_size {
614 result.push(' ');
615 }
616 }
617
618 result.push_str(&fragment.text);
619 last_line_ended_with_hyphen = fragment.text.ends_with('-');
620 last_y = fragment.y;
621 last_x = fragment.x + fragment.width;
622 }
623
624 result
625 }
626
627 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
630 if fragments.is_empty() {
631 return Vec::new();
632 }
633
634 let mut merged = Vec::new();
635 let mut current = fragments[0].clone();
636
637 for fragment in &fragments[1..] {
638 let y_diff = (current.y - fragment.y).abs();
640 let x_gap = fragment.x - (current.x + current.width);
641
642 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
649 current.text.push_str(&fragment.text);
651 current.width = (fragment.x + fragment.width) - current.x;
652 } else {
653 merged.push(current);
655 current = fragment.clone();
656 }
657 }
658
659 merged.push(current);
660 merged
661 }
662
663 fn extract_font_resources<R: Read + Seek>(
665 &mut self,
666 page: &ParsedPage,
667 document: &PdfDocument<R>,
668 ) -> ParseResult<()> {
669 self.font_cache.clear();
671
672 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
675 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
676 {
677 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
678 for (font_name, font_obj) in font_dict.0.iter() {
680 if let Some(font_ref) = font_obj.as_reference() {
681 if let Ok(PdfObject::Dictionary(font_dict)) =
682 document.get_object(font_ref.0, font_ref.1)
683 {
684 let mut cmap_extractor: CMapTextExtractor<R> =
686 CMapTextExtractor::new();
687
688 if let Ok(font_info) =
689 cmap_extractor.extract_font_info(&font_dict, document)
690 {
691 let has_to_unicode = font_info.to_unicode.is_some();
692 self.font_cache.insert(font_name.0.clone(), font_info);
693 tracing::debug!(
694 "Cached font: {} (ToUnicode: {})",
695 font_name.0,
696 has_to_unicode
697 );
698 }
699 }
700 }
701 }
702 }
703 }
704 } else if let Some(resources) = page.get_resources() {
705 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
707 for (font_name, font_obj) in font_dict.0.iter() {
708 if let Some(font_ref) = font_obj.as_reference() {
709 if let Ok(PdfObject::Dictionary(font_dict)) =
710 document.get_object(font_ref.0, font_ref.1)
711 {
712 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
713
714 if let Ok(font_info) =
715 cmap_extractor.extract_font_info(&font_dict, document)
716 {
717 let has_to_unicode = font_info.to_unicode.is_some();
718 self.font_cache.insert(font_name.0.clone(), font_info);
719 tracing::debug!(
720 "Cached font: {} (ToUnicode: {})",
721 font_name.0,
722 has_to_unicode
723 );
724 }
725 }
726 }
727 }
728 }
729 }
730
731 Ok(())
732 }
733
734 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
736 use crate::text::encoding::TextEncoding;
737
738 if let Some(ref font_name) = state.font_name {
740 if let Some(font_info) = self.font_cache.get(font_name) {
741 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
743
744 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
746 if !decoded.trim().is_empty()
748 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
749 {
750 let sanitized = sanitize_extracted_text(&decoded);
752 tracing::debug!(
753 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
754 font_name,
755 text,
756 sanitized
757 );
758 return Ok(sanitized);
759 }
760 }
761
762 tracing::debug!(
763 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
764 font_name
765 );
766 }
767 }
768
769 let encoding = if let Some(ref font_name) = state.font_name {
771 match font_name.to_lowercase().as_str() {
772 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
773 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
774 name if name.contains("standard") => TextEncoding::StandardEncoding,
775 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
776 _ => {
777 if font_name.starts_with("Times")
779 || font_name.starts_with("Helvetica")
780 || font_name.starts_with("Courier")
781 {
782 TextEncoding::WinAnsiEncoding } else {
784 TextEncoding::PdfDocEncoding }
786 }
787 }
788 } else {
789 TextEncoding::WinAnsiEncoding };
791
792 let fallback_result = encoding.decode(text);
793 let sanitized = sanitize_extracted_text(&fallback_result);
795 tracing::debug!(
796 "Fallback encoding decoding: {:?} -> \"{}\"",
797 text,
798 sanitized
799 );
800 Ok(sanitized)
801 }
802}
803
804impl Default for TextExtractor {
805 fn default() -> Self {
806 Self::new()
807 }
808}
809
/// Multiplies two affine transforms stored as `[a, b, c, d, e, f]` and returns `a × b`
/// in the same layout (the translation of `b` is added after `a`'s translation has been
/// passed through `b`'s linear part).
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear part.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation part.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
821
/// Applies the affine transform `matrix` (`[a, b, c, d, e, f]`) to the point (`x`, `y`)
/// and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
828
829fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
831 if let Some(font) = font_info {
833 if let Some(ref widths) = font.metrics.widths {
834 let first_char = font.metrics.first_char.unwrap_or(0);
835 let last_char = font.metrics.last_char.unwrap_or(255);
836 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
837
838 let mut total_width = 0.0;
839 let chars: Vec<char> = text.chars().collect();
840
841 for (i, &ch) in chars.iter().enumerate() {
842 let char_code = ch as u32;
843
844 let width = if char_code >= first_char && char_code <= last_char {
846 let index = (char_code - first_char) as usize;
847 widths.get(index).copied().unwrap_or(missing_width)
848 } else {
849 missing_width
850 };
851
852 total_width += width / 1000.0 * font_size;
854
855 if let Some(ref kerning) = font.metrics.kerning {
857 if i + 1 < chars.len() {
858 let next_char = chars[i + 1] as u32;
859 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
860 total_width += kern_value / 1000.0 * font_size;
862 }
863 }
864 }
865 }
866
867 return total_width;
868 }
869 }
870
871 text.len() as f64 * font_size * 0.5
873}
874
/// Cleans decoded text for output.
///
/// * A NUL character, together with an ETX (`U+0003`) that directly follows it, becomes a
///   single space.
/// * Runs of spaces (including spaces produced from NUL) collapse to one space; a space
///   directly after a tab is dropped as well.
/// * Tabs, newlines and carriage returns are kept as they are.
/// * All other ASCII control characters, including a lone ETX, are removed.
/// * Every other character is copied unchanged.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // True while the most recently written character counts as horizontal blank space.
    let mut prev_blank = false;
    let mut stream = text.chars().peekable();

    while let Some(current) = stream.next() {
        if current == '\0' || current == ' ' {
            // Swallow the ETX that accompanies a NUL.
            if current == '\0' && stream.peek() == Some(&'\u{3}') {
                stream.next();
            }
            if !prev_blank {
                cleaned.push(' ');
                prev_blank = true;
            }
        } else if matches!(current, '\t' | '\n' | '\r') {
            cleaned.push(current);
            // Only a tab suppresses a following space; line breaks do not.
            prev_blank = current == '\t';
        } else if !current.is_ascii_control() {
            cleaned.push(current);
            prev_blank = false;
        }
        // Remaining ASCII control characters fall through and are dropped without
        // touching `prev_blank`.
    }

    cleaned
}
967
968#[cfg(test)]
969mod tests {
970 use super::*;
971
972 #[test]
973 fn test_matrix_multiplication() {
974 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
975 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
976
977 let result = multiply_matrix(&identity, &translation);
978 assert_eq!(result, translation);
979
980 let result2 = multiply_matrix(&translation, &identity);
981 assert_eq!(result2, translation);
982 }
983
984 #[test]
985 fn test_transform_point() {
986 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
987 let (x, y) = transform_point(5.0, 5.0, &translation);
988 assert_eq!(x, 15.0);
989 assert_eq!(y, 25.0);
990 }
991
992 #[test]
993 fn test_extraction_options_default() {
994 let options = ExtractionOptions::default();
995 assert!(!options.preserve_layout);
996 assert_eq!(options.space_threshold, 0.3);
997 assert_eq!(options.newline_threshold, 10.0);
998 assert!(options.sort_by_position);
999 assert!(!options.detect_columns);
1000 assert_eq!(options.column_threshold, 50.0);
1001 assert!(options.merge_hyphenated);
1002 }
1003
1004 #[test]
1005 fn test_extraction_options_custom() {
1006 let options = ExtractionOptions {
1007 preserve_layout: true,
1008 space_threshold: 0.5,
1009 newline_threshold: 15.0,
1010 sort_by_position: false,
1011 detect_columns: true,
1012 column_threshold: 75.0,
1013 merge_hyphenated: false,
1014 };
1015 assert!(options.preserve_layout);
1016 assert_eq!(options.space_threshold, 0.5);
1017 assert_eq!(options.newline_threshold, 15.0);
1018 assert!(!options.sort_by_position);
1019 assert!(options.detect_columns);
1020 assert_eq!(options.column_threshold, 75.0);
1021 assert!(!options.merge_hyphenated);
1022 }
1023
1024 #[test]
1025 fn test_parse_font_style_bold() {
1026 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1028 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1029
1030 assert_eq!(parse_font_style("Arial Bold"), (true, false));
1032 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1033
1034 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1036 }
1037
1038 #[test]
1039 fn test_parse_font_style_italic() {
1040 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1042 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1043
1044 assert_eq!(parse_font_style("Arial Italic"), (false, true));
1046 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1047
1048 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1050 }
1051
1052 #[test]
1053 fn test_parse_font_style_bold_italic() {
1054 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1055 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1056 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1057 }
1058
1059 #[test]
1060 fn test_parse_font_style_regular() {
1061 assert_eq!(parse_font_style("Helvetica"), (false, false));
1062 assert_eq!(parse_font_style("Times-Roman"), (false, false));
1063 assert_eq!(parse_font_style("Courier"), (false, false));
1064 assert_eq!(parse_font_style("Arial"), (false, false));
1065 }
1066
1067 #[test]
1068 fn test_parse_font_style_edge_cases() {
1069 assert_eq!(parse_font_style(""), (false, false));
1071 assert_eq!(parse_font_style("UnknownFont"), (false, false));
1072
1073 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1075 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1076 }
1077
1078 #[test]
1079 fn test_text_fragment() {
1080 let fragment = TextFragment {
1081 text: "Hello".to_string(),
1082 x: 100.0,
1083 y: 200.0,
1084 width: 50.0,
1085 height: 12.0,
1086 font_size: 10.0,
1087 font_name: None,
1088 is_bold: false,
1089 is_italic: false,
1090 color: None,
1091 };
1092 assert_eq!(fragment.text, "Hello");
1093 assert_eq!(fragment.x, 100.0);
1094 assert_eq!(fragment.y, 200.0);
1095 assert_eq!(fragment.width, 50.0);
1096 assert_eq!(fragment.height, 12.0);
1097 assert_eq!(fragment.font_size, 10.0);
1098 }
1099
1100 #[test]
1101 fn test_extracted_text() {
1102 let fragments = vec![
1103 TextFragment {
1104 text: "Hello".to_string(),
1105 x: 100.0,
1106 y: 200.0,
1107 width: 50.0,
1108 height: 12.0,
1109 font_size: 10.0,
1110 font_name: None,
1111 is_bold: false,
1112 is_italic: false,
1113 color: None,
1114 },
1115 TextFragment {
1116 text: "World".to_string(),
1117 x: 160.0,
1118 y: 200.0,
1119 width: 50.0,
1120 height: 12.0,
1121 font_size: 10.0,
1122 font_name: None,
1123 is_bold: false,
1124 is_italic: false,
1125 color: None,
1126 },
1127 ];
1128
1129 let extracted = ExtractedText {
1130 text: "Hello World".to_string(),
1131 fragments: fragments,
1132 };
1133
1134 assert_eq!(extracted.text, "Hello World");
1135 assert_eq!(extracted.fragments.len(), 2);
1136 assert_eq!(extracted.fragments[0].text, "Hello");
1137 assert_eq!(extracted.fragments[1].text, "World");
1138 }
1139
1140 #[test]
1141 fn test_text_state_default() {
1142 let state = TextState::default();
1143 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1144 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1145 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1146 assert_eq!(state.leading, 0.0);
1147 assert_eq!(state.char_space, 0.0);
1148 assert_eq!(state.word_space, 0.0);
1149 assert_eq!(state.horizontal_scale, 100.0);
1150 assert_eq!(state.text_rise, 0.0);
1151 assert_eq!(state.font_size, 0.0);
1152 assert!(state.font_name.is_none());
1153 assert_eq!(state.render_mode, 0);
1154 }
1155
1156 #[test]
1157 fn test_matrix_operations() {
1158 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
1161 assert_eq!(x, 0.0);
1162 assert_eq!(y, 1.0);
1163
1164 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1166 let (x, y) = transform_point(5.0, 5.0, &scale);
1167 assert_eq!(x, 10.0);
1168 assert_eq!(y, 15.0);
1169
1170 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1172 let (x, y) = transform_point(1.0, 1.0, &complex);
1173 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
1176
1177 #[test]
1178 fn test_text_extractor_new() {
1179 let extractor = TextExtractor::new();
1180 let options = extractor.options;
1181 assert!(!options.preserve_layout);
1182 assert_eq!(options.space_threshold, 0.3);
1183 assert_eq!(options.newline_threshold, 10.0);
1184 assert!(options.sort_by_position);
1185 assert!(!options.detect_columns);
1186 assert_eq!(options.column_threshold, 50.0);
1187 assert!(options.merge_hyphenated);
1188 }
1189
1190 #[test]
1191 fn test_text_extractor_with_options() {
1192 let options = ExtractionOptions {
1193 preserve_layout: true,
1194 space_threshold: 0.3,
1195 newline_threshold: 12.0,
1196 sort_by_position: false,
1197 detect_columns: true,
1198 column_threshold: 60.0,
1199 merge_hyphenated: false,
1200 };
1201 let extractor = TextExtractor::with_options(options.clone());
1202 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1203 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1204 assert_eq!(
1205 extractor.options.newline_threshold,
1206 options.newline_threshold
1207 );
1208 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1209 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1210 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1211 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1212 }
1213
1214 #[test]
1219 fn test_calculate_text_width_with_no_font_info() {
1220 let width = calculate_text_width("Hello", 12.0, None);
1222
1223 assert_eq!(
1225 width, 30.0,
1226 "Without font info, should use simplified calculation: len * font_size * 0.5"
1227 );
1228 }
1229
1230 #[test]
1231 fn test_calculate_text_width_with_empty_metrics() {
1232 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1233
1234 let font_info = FontInfo {
1236 name: "TestFont".to_string(),
1237 font_type: "Type1".to_string(),
1238 encoding: None,
1239 to_unicode: None,
1240 differences: None,
1241 descendant_font: None,
1242 cid_to_gid_map: None,
1243 metrics: FontMetrics {
1244 first_char: None,
1245 last_char: None,
1246 widths: None,
1247 missing_width: Some(500.0),
1248 kerning: None,
1249 },
1250 };
1251
1252 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1253
1254 assert_eq!(
1256 width, 30.0,
1257 "Without widths array, should fall back to simplified calculation"
1258 );
1259 }
1260
1261 #[test]
1262 fn test_calculate_text_width_with_complete_metrics() {
1263 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1264
1265 let mut widths = vec![0.0; 95]; widths[72 - 32] = 722.0; widths[101 - 32] = 556.0; widths[108 - 32] = 278.0; widths[111 - 32] = 611.0; let font_info = FontInfo {
1276 name: "Helvetica".to_string(),
1277 font_type: "Type1".to_string(),
1278 encoding: None,
1279 to_unicode: None,
1280 differences: None,
1281 descendant_font: None,
1282 cid_to_gid_map: None,
1283 metrics: FontMetrics {
1284 first_char: Some(32),
1285 last_char: Some(126),
1286 widths: Some(widths),
1287 missing_width: Some(500.0),
1288 kerning: None,
1289 },
1290 };
1291
1292 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1293
1294 let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1302 let tolerance = 0.0001; assert!(
1304 (width - expected).abs() < tolerance,
1305 "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1306 expected,
1307 width,
1308 (width - expected).abs()
1309 );
1310
1311 let simplified = 5.0 * 12.0 * 0.5; assert_ne!(
1314 width, simplified,
1315 "Metrics-based calculation should differ from simplified (30.0)"
1316 );
1317 }
1318
1319 #[test]
1320 fn test_calculate_text_width_character_outside_range() {
1321 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1322
1323 let widths = vec![722.0; 26]; let font_info = FontInfo {
1327 name: "TestFont".to_string(),
1328 font_type: "Type1".to_string(),
1329 encoding: None,
1330 to_unicode: None,
1331 differences: None,
1332 descendant_font: None,
1333 cid_to_gid_map: None,
1334 metrics: FontMetrics {
1335 first_char: Some(65), last_char: Some(90), widths: Some(widths),
1338 missing_width: Some(500.0),
1339 kerning: None,
1340 },
1341 };
1342
1343 let width = calculate_text_width("A1", 10.0, Some(&font_info));
1345
1346 let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1351 assert_eq!(
1352 width, expected,
1353 "Should use missing_width for characters outside range"
1354 );
1355 }
1356
1357 #[test]
1358 fn test_calculate_text_width_missing_width_in_array() {
1359 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1360
1361 let mut widths = vec![500.0; 95]; widths[10] = 0.0; let font_info = FontInfo {
1366 name: "TestFont".to_string(),
1367 font_type: "Type1".to_string(),
1368 encoding: None,
1369 to_unicode: None,
1370 differences: None,
1371 descendant_font: None,
1372 cid_to_gid_map: None,
1373 metrics: FontMetrics {
1374 first_char: Some(32),
1375 last_char: Some(126),
1376 widths: Some(widths),
1377 missing_width: Some(600.0),
1378 kerning: None,
1379 },
1380 };
1381
1382 let char_code = 42u8 as char; let text = char_code.to_string();
1385 let width = calculate_text_width(&text, 10.0, Some(&font_info));
1386
1387 assert_eq!(
1390 width, 0.0,
1391 "Should use 0.0 width from array, not missing_width"
1392 );
1393 }
1394
/// Confirms that an empty string measures as zero width, both when font
/// metrics are supplied and when no font information is given at all.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    // Path with font metrics available.
    let with_font = calculate_text_width("", 12.0, Some(&font_info));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Path without any font information.
    let without_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        without_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1426
/// Checks that a non-ASCII character outside the font's declared code range
/// is measured with `missing_width`.
///
/// The font declares codes 32..=126 at 500 units each and a `missing_width`
/// of 600; "Ñ" at font size 10 is expected to measure 6.0.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    let measured = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // 600 / 1000 * 10 is exactly 6.0, so an exact comparison is safe here.
    assert_eq!(
        measured, 6.0,
        "Unicode character outside range should use missing_width"
    );
}
1459
/// Verifies that the measured width of a glyph scales linearly with the
/// font size.
///
/// The font describes a single glyph (code 65, 722 units). "A" is measured at
/// sizes 10 and 20; each result is checked against `722 / 1000 * size` and
/// the second is checked to be twice the first.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(65),
            widths: Some(vec![722.0]),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
    let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

    // 722 / 1000 * size is not exactly representable, so compare with a
    // tolerance instead of relying on the implementation using the very same
    // order of floating-point operations as this test.
    let tolerance = 0.0001;
    let expected_10 = 722.0 / 1000.0 * 10.0;
    let expected_20 = 722.0 / 1000.0 * 20.0;

    assert!(
        (width_10 - expected_10).abs() < tolerance,
        "Width at size 10: expected {}, got {}",
        expected_10,
        width_10
    );
    assert!(
        (width_20 - expected_20).abs() < tolerance,
        "Width at size 20: expected {}, got {}",
        expected_20,
        width_20
    );
    assert!(
        (width_20 - width_10 * 2.0).abs() < tolerance,
        "Width should scale linearly with font size: size 10 gave {}, size 20 gave {}",
        width_10,
        width_20
    );
}
1494
/// Compares the measured width of "i" in a proportional font against a
/// monospace font.
///
/// Both fonts describe codes 105..=107. The proportional font lists 278 units
/// for its first entry while the monospace font lists 600 for every entry, so
/// the proportional measurement is expected to be the smaller one.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts differ only in name, widths and missing width, so they are
    // built through one local factory.
    let type1_font = |name: &str, widths, missing_width| FontInfo {
        name: name.to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        },
    };

    let proportional_font = type1_font("Helvetica", vec![278.0, 556.0, 722.0], 500.0);
    let monospace_font = type1_font("Courier", vec![600.0, 600.0, 600.0], 600.0);

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1548
/// Exercises kerning-aware width calculation.
///
/// The font declares kerning for the pairs (65, 86) = -50 and (65, 87) = -40.
/// The test expects:
/// - "AV" and "AW" to include their kerning adjustment,
/// - "VA" (no entry in the kerning map) to be the plain sum of glyph widths,
/// - "AV" to end up narrower than "VA".
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Widths for codes 32..=126: 500 units everywhere except the three
    // overrides for codes 65, 86 and 87.
    let mut widths = vec![500.0; 95];
    for (code, glyph_width) in [(65usize, 722.0), (86, 722.0), (87, 944.0)] {
        widths[code - 32] = glyph_width;
    }

    // Kerning adjustments keyed by (left code, right code).
    let kerning = HashMap::from([((65, 86), -50.0), ((65, 87), -40.0)]);

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(widths),
        missing_width: Some(500.0),
        kerning: Some(kerning),
    };
    let font_info = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    let tolerance = 0.0001;

    // "AV": both glyph widths plus the -50 kerning adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let diff_av = (width_av - expected_av).abs();
    assert!(
        diff_av < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        diff_av
    );

    // "AW": both glyph widths plus the -40 kerning adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let diff_aw = (width_aw - expected_aw).abs();
    assert!(
        diff_aw < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        diff_aw
    );

    // "VA": the reversed pair has no kerning entry, so only glyph widths count.
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let diff_va = (width_va - expected_va).abs();
    assert!(
        diff_va < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        diff_va
    );

    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1633
/// Feeds `parse_truetype_kern_table` a hand-built, minimal font blob and
/// checks that exactly two kerning pairs come back: (65, 86) = -50 and
/// (65, 87) = -40.
///
/// The blob is a 12-byte header, two 16-byte table records tagged `head` and
/// `kern`, 54 zero bytes, then a 30-byte `kern` payload. The field names in
/// the comments below follow the TrueType table layout and are an
/// assumption; they are not visible in this file. TODO confirm against the
/// parser.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Header: 4 version bytes followed by four big-endian u16 values
    // (presumably numTables, searchRange, entrySelector, rangeShift).
    let mut ttf_data: Vec<u8> = vec![0x00, 0x01, 0x00, 0x00];
    for half_word in [0x0002u16, 0x0020, 0x0001, 0x0000] {
        ttf_data.extend_from_slice(&half_word.to_be_bytes());
    }

    // Table records: tag, zero u32 (presumably checksum), offset, length.
    // `head` sits at 0x2C (right after the records), `kern` at 0x62.
    for (tag, offset, length) in [(b"head", 0x2Cu32, 0x36u32), (b"kern", 0x62, 0x1E)] {
        ttf_data.extend_from_slice(tag);
        ttf_data.extend_from_slice(&0u32.to_be_bytes());
        ttf_data.extend_from_slice(&offset.to_be_bytes());
        ttf_data.extend_from_slice(&length.to_be_bytes());
    }

    // Zero-filled `head` payload (0x36 = 54 bytes).
    ttf_data.extend_from_slice(&[0u8; 54]);

    // `kern` payload, leading part: nine big-endian u16 values (presumably
    // table header, subtable header with length 0x1A, and the pair-count /
    // search fields with a pair count of 2).
    for half_word in [
        0x0000u16, 0x0001, 0x0000, 0x001A, 0x0000, 0x0002, 0x0008, 0x0000, 0x0004,
    ] {
        ttf_data.extend_from_slice(&half_word.to_be_bytes());
    }

    // Kerning pairs: left code, right code, signed adjustment.
    for (left, right, value) in [(65u16, 86u16, -50i16), (65, 87, -40)] {
        ttf_data.extend_from_slice(&left.to_be_bytes());
        ttf_data.extend_from_slice(&right.to_be_bytes());
        ttf_data.extend_from_slice(&value.to_be_bytes());
    }

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    let av_pair = kerning_map.get(&(65, 86));
    assert_eq!(
        av_pair,
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    let aw_pair = kerning_map.get(&(65, 87));
    assert_eq!(
        aw_pair,
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1716
/// Checks that `extract_truetype_kerning` succeeds with an empty map when the
/// font blob contains no `kern` table.
///
/// The blob is a 12-byte header, a single 16-byte table record tagged `head`
/// (offset 0x1C, length 0x36), and 54 zero bytes.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    let mut ttf_data: Vec<u8> = Vec::new();

    // 12-byte header; the sixth byte (0x01) is presumably the table count.
    ttf_data.extend_from_slice(&[
        0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,
    ]);

    // The only table record: tag, four zero bytes, offset 0x1C, length 0x36.
    ttf_data.extend_from_slice(b"head");
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1C]);
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]);

    // Zero-filled payload matching the record's declared length (54 bytes).
    ttf_data.extend_from_slice(&[0u8; 54]);

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let kerning_map = result.unwrap();
    assert!(
        kerning_map.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1756}