1use crate::parser::content::{ContentOperation, ContentParser, TextElement};
7use crate::parser::document::PdfDocument;
8use crate::parser::objects::PdfObject;
9use crate::parser::page_tree::ParsedPage;
10use crate::parser::ParseResult;
11use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
12use std::collections::HashMap;
13use std::io::{Read, Seek};
14
/// Tunable parameters that control how text is pulled out of a page.
#[derive(Clone, Debug)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are recorded and the final text is
    /// rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text are treated as being
    /// on different lines.
    pub newline_threshold: f64,
    /// When `true`, recorded fragments are sorted by position before the text
    /// is reconstructed.
    pub sort_by_position: bool,
    /// When `true`, fragment sorting additionally tries to detect columns.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on one line that
    /// is interpreted as a column break.
    pub column_threshold: f64,
    /// When `true`, a trailing hyphen at a line break is removed and the two
    /// lines are joined without a newline.
    pub merge_hyphenated: bool,
}
33
34impl Default for ExtractionOptions {
35 fn default() -> Self {
36 Self {
37 preserve_layout: false,
38 space_threshold: 0.2,
39 newline_threshold: 10.0,
40 sort_by_position: true,
41 detect_columns: false,
42 column_threshold: 50.0,
43 merge_hyphenated: true,
44 }
45 }
46}
47
48#[derive(Debug, Clone)]
50pub struct ExtractedText {
51 pub text: String,
53 pub fragments: Vec<TextFragment>,
55}
56
/// A piece of text together with where it was shown on the page.
#[derive(Clone, Debug)]
pub struct TextFragment {
    /// The decoded text of this fragment.
    pub text: String,
    /// X coordinate of the fragment's origin.
    pub x: f64,
    /// Y coordinate of the fragment's origin.
    pub y: f64,
    /// Estimated width of the text.
    pub width: f64,
    /// Height of the fragment (the extractor stores the font size here).
    pub height: f64,
    /// Font size that was active when the text was shown.
    pub font_size: f64,
}
73
/// Text-related graphics state tracked while walking a content stream.
///
/// `Debug` and `Clone` are derived so the state can be logged or snapshotted
/// while diagnosing extraction problems.
#[derive(Debug, Clone)]
struct TextState {
    /// Current text matrix as `[a, b, c, d, e, f]`.
    text_matrix: [f64; 6],
    /// Text line matrix; the base that line-positioning operators move from.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix. Stored but not read by the extractor,
    /// hence the lint allowance.
    #[allow(dead_code)]
    ctm: [f64; 6],
    /// Leading applied (negated) by the next-line operator.
    leading: f64,
    /// Character spacing as set by the content stream.
    char_space: f64,
    /// Word spacing as set by the content stream.
    word_space: f64,
    /// Horizontal scaling in percent (`100.0` means unscaled).
    horizontal_scale: f64,
    /// Text rise as set by the content stream.
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Resource name of the currently selected font, if any.
    font_name: Option<String>,
    /// Text rendering mode as set by the content stream.
    render_mode: u8,
}
100
101impl Default for TextState {
102 fn default() -> Self {
103 Self {
104 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
105 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
106 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
107 leading: 0.0,
108 char_space: 0.0,
109 word_space: 0.0,
110 horizontal_scale: 100.0,
111 text_rise: 0.0,
112 font_size: 0.0,
113 font_name: None,
114 render_mode: 0,
115 }
116 }
117}
118
119pub struct TextExtractor {
121 options: ExtractionOptions,
122 font_cache: HashMap<String, FontInfo>,
124}
125
126impl TextExtractor {
127 pub fn new() -> Self {
129 Self {
130 options: ExtractionOptions::default(),
131 font_cache: HashMap::new(),
132 }
133 }
134
135 pub fn with_options(options: ExtractionOptions) -> Self {
137 Self {
138 options,
139 font_cache: HashMap::new(),
140 }
141 }
142
143 pub fn extract_from_document<R: Read + Seek>(
145 &mut self,
146 document: &PdfDocument<R>,
147 ) -> ParseResult<Vec<ExtractedText>> {
148 let page_count = document.page_count()?;
149 let mut results = Vec::new();
150
151 for i in 0..page_count {
152 let text = self.extract_from_page(document, i)?;
153 results.push(text);
154 }
155
156 Ok(results)
157 }
158
159 pub fn extract_from_page<R: Read + Seek>(
161 &mut self,
162 document: &PdfDocument<R>,
163 page_index: u32,
164 ) -> ParseResult<ExtractedText> {
165 let page = document.get_page(page_index)?;
167
168 self.extract_font_resources(&page, document)?;
170
171 let streams = page.content_streams_with_document(document)?;
173
174 let mut extracted_text = String::new();
175 let mut fragments = Vec::new();
176 let mut state = TextState::default();
177 let mut in_text_object = false;
178 let mut last_x = 0.0;
179 let mut last_y = 0.0;
180
181 for stream_data in streams {
183 let operations = match ContentParser::parse_content(&stream_data) {
184 Ok(ops) => ops,
185 Err(e) => {
186 eprintln!("Warning: Failed to parse content stream, skipping: {}", e);
188 continue;
189 }
190 };
191
192 for op in operations {
193 match op {
194 ContentOperation::BeginText => {
195 in_text_object = true;
196 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
198 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
199 }
200
201 ContentOperation::EndText => {
202 in_text_object = false;
203 }
204
205 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
206 state.text_matrix =
207 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
208 state.text_line_matrix =
209 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
210 }
211
212 ContentOperation::MoveText(tx, ty) => {
213 let new_matrix = multiply_matrix(
215 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
216 &state.text_line_matrix,
217 );
218 state.text_matrix = new_matrix;
219 state.text_line_matrix = new_matrix;
220 }
221
222 ContentOperation::NextLine => {
223 let new_matrix = multiply_matrix(
225 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
226 &state.text_line_matrix,
227 );
228 state.text_matrix = new_matrix;
229 state.text_line_matrix = new_matrix;
230 }
231
232 ContentOperation::ShowText(text) => {
233 if in_text_object {
234 let text_bytes = &text;
235 let decoded = self.decode_text(text_bytes, &state)?;
236
237 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
239
240 if !extracted_text.is_empty() {
242 let dx = x - last_x;
243 let dy = (y - last_y).abs();
244
245 if dy > self.options.newline_threshold {
246 extracted_text.push('\n');
247 } else if dx > self.options.space_threshold * state.font_size {
248 extracted_text.push(' ');
249 }
250 }
251
252 extracted_text.push_str(&decoded);
253
254 let font_info = state
256 .font_name
257 .as_ref()
258 .and_then(|name| self.font_cache.get(name));
259
260 if self.options.preserve_layout {
261 fragments.push(TextFragment {
262 text: decoded.clone(),
263 x,
264 y,
265 width: calculate_text_width(
266 &decoded,
267 state.font_size,
268 font_info,
269 ),
270 height: state.font_size,
271 font_size: state.font_size,
272 });
273 }
274
275 last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
277 last_y = y;
278
279 let text_width =
281 calculate_text_width(&decoded, state.font_size, font_info);
282 let tx = text_width * state.horizontal_scale / 100.0;
283 state.text_matrix =
284 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
285 }
286 }
287
288 ContentOperation::ShowTextArray(array) => {
289 if in_text_object {
290 let font_info = state
292 .font_name
293 .as_ref()
294 .and_then(|name| self.font_cache.get(name));
295
296 for item in array {
297 match item {
298 TextElement::Text(text_bytes) => {
299 let decoded = self.decode_text(&text_bytes, &state)?;
300 extracted_text.push_str(&decoded);
301
302 let text_width = calculate_text_width(
304 &decoded,
305 state.font_size,
306 font_info,
307 );
308 let tx = text_width * state.horizontal_scale / 100.0;
309 state.text_matrix = multiply_matrix(
310 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
311 &state.text_matrix,
312 );
313 }
314 TextElement::Spacing(adjustment) => {
315 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
317 state.text_matrix = multiply_matrix(
318 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
319 &state.text_matrix,
320 );
321 }
322 }
323 }
324 }
325 }
326
327 ContentOperation::SetFont(name, size) => {
328 state.font_name = Some(name);
329 state.font_size = size as f64;
330 }
331
332 ContentOperation::SetLeading(leading) => {
333 state.leading = leading as f64;
334 }
335
336 ContentOperation::SetCharSpacing(spacing) => {
337 state.char_space = spacing as f64;
338 }
339
340 ContentOperation::SetWordSpacing(spacing) => {
341 state.word_space = spacing as f64;
342 }
343
344 ContentOperation::SetHorizontalScaling(scale) => {
345 state.horizontal_scale = scale as f64;
346 }
347
348 ContentOperation::SetTextRise(rise) => {
349 state.text_rise = rise as f64;
350 }
351
352 ContentOperation::SetTextRenderMode(mode) => {
353 state.render_mode = mode as u8;
354 }
355
356 _ => {
357 }
359 }
360 }
361 }
362
363 if self.options.sort_by_position && !fragments.is_empty() {
365 self.sort_and_merge_fragments(&mut fragments);
366 }
367
368 if self.options.preserve_layout && !fragments.is_empty() {
370 extracted_text = self.reconstruct_text_from_fragments(&fragments);
371 }
372
373 Ok(ExtractedText {
374 text: extracted_text,
375 fragments,
376 })
377 }
378
379 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
381 fragments.sort_by(|a, b| {
383 let y_diff = (b.y - a.y).abs();
385 if y_diff < self.options.newline_threshold {
386 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
388 } else {
389 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
391 }
392 });
393
394 if self.options.detect_columns {
396 self.detect_and_sort_columns(fragments);
397 }
398 }
399
400 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
402 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
404 let mut current_line: Vec<&mut TextFragment> = Vec::new();
405 let mut last_y = f64::INFINITY;
406
407 for fragment in fragments.iter_mut() {
408 let fragment_y = fragment.y;
409 if (last_y - fragment_y).abs() > self.options.newline_threshold
410 && !current_line.is_empty()
411 {
412 lines.push(current_line);
413 current_line = Vec::new();
414 }
415 current_line.push(fragment);
416 last_y = fragment_y;
417 }
418 if !current_line.is_empty() {
419 lines.push(current_line);
420 }
421
422 let mut column_boundaries = vec![0.0];
424 for line in &lines {
425 if line.len() > 1 {
426 for i in 0..line.len() - 1 {
427 let gap = line[i + 1].x - (line[i].x + line[i].width);
428 if gap > self.options.column_threshold {
429 let boundary = line[i].x + line[i].width + gap / 2.0;
430 if !column_boundaries
431 .iter()
432 .any(|&b| (b - boundary).abs() < 10.0)
433 {
434 column_boundaries.push(boundary);
435 }
436 }
437 }
438 }
439 }
440 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
441
442 if column_boundaries.len() > 1 {
444 fragments.sort_by(|a, b| {
445 let col_a = column_boundaries
447 .iter()
448 .position(|&boundary| a.x < boundary)
449 .unwrap_or(column_boundaries.len())
450 - 1;
451 let col_b = column_boundaries
452 .iter()
453 .position(|&boundary| b.x < boundary)
454 .unwrap_or(column_boundaries.len())
455 - 1;
456
457 if col_a != col_b {
458 col_a.cmp(&col_b)
459 } else {
460 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
462 }
463 });
464 }
465 }
466
467 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
469 let mut result = String::new();
470 let mut last_y = f64::INFINITY;
471 let mut last_x = 0.0;
472 let mut last_line_ended_with_hyphen = false;
473
474 for fragment in fragments {
475 let y_diff = (last_y - fragment.y).abs();
477 if !result.is_empty() && y_diff > self.options.newline_threshold {
478 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
480 if result.ends_with('-') {
482 result.pop();
483 }
484 } else {
485 result.push('\n');
486 }
487 } else if !result.is_empty() {
488 let x_gap = fragment.x - last_x;
490 if x_gap > self.options.space_threshold * fragment.font_size {
491 result.push(' ');
492 }
493 }
494
495 result.push_str(&fragment.text);
496 last_line_ended_with_hyphen = fragment.text.ends_with('-');
497 last_y = fragment.y;
498 last_x = fragment.x + fragment.width;
499 }
500
501 result
502 }
503
504 fn extract_font_resources<R: Read + Seek>(
506 &mut self,
507 page: &ParsedPage,
508 document: &PdfDocument<R>,
509 ) -> ParseResult<()> {
510 self.font_cache.clear();
512
513 if let Some(resources) = page.get_resources() {
515 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
516 for (font_name, font_obj) in font_dict.0.iter() {
518 if let Some(font_ref) = font_obj.as_reference() {
519 if let Ok(PdfObject::Dictionary(font_dict)) =
520 document.get_object(font_ref.0, font_ref.1)
521 {
522 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
524
525 if let Ok(font_info) =
526 cmap_extractor.extract_font_info(&font_dict, document)
527 {
528 self.font_cache.insert(font_name.0.clone(), font_info);
529 tracing::debug!(
530 "Cached font: {} -> {:?}",
531 font_name.0,
532 self.font_cache.get(&font_name.0)
533 );
534 }
535 }
536 }
537 }
538 }
539 }
540
541 Ok(())
542 }
543
544 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
546 use crate::text::encoding::TextEncoding;
547
548 if let Some(ref font_name) = state.font_name {
550 if let Some(font_info) = self.font_cache.get(font_name) {
551 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
553
554 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
556 tracing::debug!(
557 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
558 font_name,
559 text,
560 decoded
561 );
562 return Ok(decoded);
563 }
564
565 tracing::debug!(
566 "CMap decoding failed for font {}, falling back to encoding",
567 font_name
568 );
569 }
570 }
571
572 let encoding = if let Some(ref font_name) = state.font_name {
574 match font_name.to_lowercase().as_str() {
575 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
576 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
577 name if name.contains("standard") => TextEncoding::StandardEncoding,
578 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
579 _ => {
580 if font_name.starts_with("Times")
582 || font_name.starts_with("Helvetica")
583 || font_name.starts_with("Courier")
584 {
585 TextEncoding::WinAnsiEncoding } else {
587 TextEncoding::PdfDocEncoding }
589 }
590 }
591 } else {
592 TextEncoding::WinAnsiEncoding };
594
595 let fallback_result = encoding.decode(text);
596 tracing::debug!(
597 "Fallback encoding decoding: {:?} -> \"{}\"",
598 text,
599 fallback_result
600 );
601 Ok(fallback_result)
602 }
603}
604
605impl Default for TextExtractor {
606 fn default() -> Self {
607 Self::new()
608 }
609}
610
/// Multiplies two affine matrices given as `[a, b, c, d, e, f]` and returns
/// `a × b`, i.e. the transform that applies `a` first and then `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear part (first four entries), then the translation row.
    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
622
/// Applies the affine `matrix` (`[a, b, c, d, e, f]`) to the point `(x, y)`
/// and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
629
630fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
632 if let Some(font) = font_info {
634 if let Some(ref widths) = font.metrics.widths {
635 let first_char = font.metrics.first_char.unwrap_or(0);
636 let last_char = font.metrics.last_char.unwrap_or(255);
637 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
638
639 let mut total_width = 0.0;
640 let chars: Vec<char> = text.chars().collect();
641
642 for (i, &ch) in chars.iter().enumerate() {
643 let char_code = ch as u32;
644
645 let width = if char_code >= first_char && char_code <= last_char {
647 let index = (char_code - first_char) as usize;
648 widths.get(index).copied().unwrap_or(missing_width)
649 } else {
650 missing_width
651 };
652
653 total_width += width / 1000.0 * font_size;
655
656 if let Some(ref kerning) = font.metrics.kerning {
658 if i + 1 < chars.len() {
659 let next_char = chars[i + 1] as u32;
660 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
661 total_width += kern_value / 1000.0 * font_size;
663 }
664 }
665 }
666 }
667
668 return total_width;
669 }
670 }
671
672 text.len() as f64 * font_size * 0.5
674}
675
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    /// Identity matrix shared by the matrix tests.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Allowed absolute error for floating-point width comparisons.
    const TOLERANCE: f64 = 0.0001;

    /// Builds a `Type1` font fixture that only differs in name and metrics.
    fn type1_font(name: &str, metrics: FontMetrics) -> FontInfo {
        FontInfo {
            name: name.to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics,
        }
    }

    #[test]
    fn test_matrix_multiplication() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        let result = multiply_matrix(&IDENTITY, &translation);
        assert_eq!(result, translation);

        let result2 = multiply_matrix(&translation, &IDENTITY);
        assert_eq!(result2, translation);
    }

    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }

    #[test]
    fn test_extraction_options_default() {
        let options = ExtractionOptions::default();
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    #[test]
    fn test_extraction_options_custom() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };
        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }

    #[test]
    fn test_text_fragment() {
        let fragment = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        };
        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }

    #[test]
    fn test_extracted_text() {
        let hello = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        };
        let world = TextFragment {
            text: "World".to_string(),
            x: 160.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
        };
        let fragments = vec![hello, world];

        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments: fragments.clone(),
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }

    #[test]
    fn test_text_state_default() {
        let state = TextState::default();
        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }

    #[test]
    fn test_matrix_operations() {
        // Rotation.
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        let (x, y) = transform_point(1.0, 0.0, &rotation);
        assert_eq!(x, 0.0);
        assert_eq!(y, 1.0);

        // Non-uniform scale.
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        let (x, y) = transform_point(5.0, 5.0, &scale);
        assert_eq!(x, 10.0);
        assert_eq!(y, 15.0);

        // Combined linear part and translation.
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        let (x, y) = transform_point(1.0, 1.0, &complex);
        assert_eq!(x, 13.0);
        assert_eq!(y, 23.0);
    }

    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let options = extractor.options;
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }

    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };
        let extractor = TextExtractor::with_options(options.clone());
        let actual = &extractor.options;
        assert_eq!(actual.preserve_layout, options.preserve_layout);
        assert_eq!(actual.space_threshold, options.space_threshold);
        assert_eq!(actual.newline_threshold, options.newline_threshold);
        assert_eq!(actual.sort_by_position, options.sort_by_position);
        assert_eq!(actual.detect_columns, options.detect_columns);
        assert_eq!(actual.column_threshold, options.column_threshold);
        assert_eq!(actual.merge_hyphenated, options.merge_hyphenated);
    }

    #[test]
    fn test_calculate_text_width_with_no_font_info() {
        let width = calculate_text_width("Hello", 12.0, None);

        assert_eq!(
            width, 30.0,
            "Without font info, should use simplified calculation: len * font_size * 0.5"
        );
    }

    #[test]
    fn test_calculate_text_width_with_empty_metrics() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: None,
                last_char: None,
                widths: None,
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        assert_eq!(
            width, 30.0,
            "Without widths array, should fall back to simplified calculation"
        );
    }

    #[test]
    fn test_calculate_text_width_with_complete_metrics() {
        // Widths for codes 32..=126 with 'H', 'e', 'l' and 'o' filled in.
        let mut widths = vec![0.0; 95];
        widths[72 - 32] = 722.0;
        widths[101 - 32] = 556.0;
        widths[108 - 32] = 278.0;
        widths[111 - 32] = 611.0;

        let font_info = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
        let tolerance = TOLERANCE;
        assert!(
            (width - expected).abs() < tolerance,
            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
            expected,
            width,
            (width - expected).abs()
        );

        let simplified = 5.0 * 12.0 * 0.5;
        assert_ne!(
            width, simplified,
            "Metrics-based calculation should differ from simplified (30.0)"
        );
    }

    #[test]
    fn test_calculate_text_width_character_outside_range() {
        // Widths cover 'A'..='Z' only.
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(65),
                last_char: Some(90),
                widths: Some(vec![722.0; 26]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("A1", 10.0, Some(&font_info));

        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
        assert_eq!(
            width, expected,
            "Should use missing_width for characters outside range"
        );
    }

    #[test]
    fn test_calculate_text_width_missing_width_in_array() {
        // Index 10 corresponds to character code 42.
        let mut widths = vec![500.0; 95];
        widths[10] = 0.0;

        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(600.0),
                kerning: None,
            },
        );

        let text = (42u8 as char).to_string();
        let width = calculate_text_width(&text, 10.0, Some(&font_info));

        assert_eq!(
            width, 0.0,
            "Should use 0.0 width from array, not missing_width"
        );
    }

    #[test]
    fn test_calculate_text_width_empty_string() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("", 12.0, Some(&font_info));
        assert_eq!(width, 0.0, "Empty string should have zero width");

        let width_no_font = calculate_text_width("", 12.0, None);
        assert_eq!(
            width_no_font, 0.0,
            "Empty string should have zero width (no font)"
        );
    }

    #[test]
    fn test_calculate_text_width_unicode_characters() {
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(600.0),
                kerning: None,
            },
        );

        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

        assert_eq!(
            width, 6.0,
            "Unicode character outside range should use missing_width"
        );
    }

    #[test]
    fn test_calculate_text_width_different_font_sizes() {
        // A single-entry widths array describing only 'A'.
        let font_info = type1_font(
            "TestFont",
            FontMetrics {
                first_char: Some(65),
                last_char: Some(65),
                widths: Some(vec![722.0]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );

        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
        assert_eq!(
            width_20,
            width_10 * 2.0,
            "Width should scale linearly with font size"
        );
    }

    #[test]
    fn test_calculate_text_width_proportional_vs_monospace() {
        // Both fonts describe codes 105..=107 ('i', 'j', 'k').
        let proportional_font = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(105),
                last_char: Some(107),
                widths: Some(vec![278.0, 556.0, 722.0]),
                missing_width: Some(500.0),
                kerning: None,
            },
        );
        let monospace_font = type1_font(
            "Courier",
            FontMetrics {
                first_char: Some(105),
                last_char: Some(107),
                widths: Some(vec![600.0, 600.0, 600.0]),
                missing_width: Some(600.0),
                kerning: None,
            },
        );

        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

        assert!(
            prop_width < mono_width,
            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
            prop_width,
            mono_width
        );
    }

    #[test]
    fn test_calculate_text_width_with_kerning() {
        use std::collections::HashMap;

        // Widths for 'A' (65), 'V' (86) and 'W' (87); everything else is 500.
        let mut widths = vec![500.0; 95];
        widths[65 - 32] = 722.0;
        widths[86 - 32] = 722.0;
        widths[87 - 32] = 944.0;

        // Kerning exists for "AV" and "AW" only.
        let kerning: HashMap<_, _> = [((65, 86), -50.0), ((65, 87), -40.0)]
            .iter()
            .copied()
            .collect();

        let font_info = type1_font(
            "Helvetica",
            FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: Some(kerning),
            },
        );
        let tolerance = TOLERANCE;

        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
        assert!(
            (width_av - expected_av).abs() < tolerance,
            "AV with kerning: expected {}, got {}, diff {}",
            expected_av,
            width_av,
            (width_av - expected_av).abs()
        );

        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
        assert!(
            (width_aw - expected_aw).abs() < tolerance,
            "AW with kerning: expected {}, got {}, diff {}",
            expected_aw,
            width_aw,
            (width_aw - expected_aw).abs()
        );

        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
        assert!(
            (width_va - expected_va).abs() < tolerance,
            "VA without kerning: expected {}, got {}, diff {}",
            expected_va,
            width_va,
            (width_va - expected_va).abs()
        );

        assert!(
            width_av < width_va,
            "AV with kerning ({}) should be narrower than VA without kerning ({})",
            width_av,
            width_va
        );
    }

    #[test]
    fn test_parse_truetype_kern_table_minimal() {
        use crate::text::extraction_cmap::parse_truetype_kern_table;

        // First 12 bytes of the font data.
        let mut ttf_data: Vec<u8> = vec![
            0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x20, 0x00, 0x01, 0x00, 0x00,
        ];

        // Two 16-byte records: a 4-byte tag followed by three 4-byte fields.
        let records: [(&[u8; 4], [u8; 4], [u8; 4], [u8; 4]); 2] = [
            (
                b"head",
                [0x00, 0x00, 0x00, 0x00],
                [0x00, 0x00, 0x00, 0x2C],
                [0x00, 0x00, 0x00, 0x36],
            ),
            (
                b"kern",
                [0x00, 0x00, 0x00, 0x00],
                [0x00, 0x00, 0x00, 0x62],
                [0x00, 0x00, 0x00, 0x1E],
            ),
        ];
        for (tag, first, second, third) in records.iter() {
            ttf_data.extend_from_slice(&tag[..]);
            ttf_data.extend_from_slice(first);
            ttf_data.extend_from_slice(second);
            ttf_data.extend_from_slice(third);
        }

        // 54 zero bytes, followed by the 30 bytes of kerning data.
        ttf_data.extend_from_slice(&[0u8; 54]);
        ttf_data.extend_from_slice(&[
            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x02, 0x00, 0x08,
            0x00, 0x00, 0x00, 0x04, 0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, 0x00, 0x41, 0x00, 0x57,
            0xFF, 0xD8,
        ]);

        let result = parse_truetype_kern_table(&ttf_data);
        assert!(
            result.is_ok(),
            "Should parse minimal kern table successfully: {:?}",
            result.err()
        );

        let kerning_map = result.unwrap();
        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

        assert_eq!(
            kerning_map.get(&(65, 86)),
            Some(&-50.0),
            "Should have A+V kerning pair with value -50"
        );

        assert_eq!(
            kerning_map.get(&(65, 87)),
            Some(&-40.0),
            "Should have A+W kerning pair with value -40"
        );
    }

    #[test]
    fn test_parse_kern_table_no_kern_table() {
        use crate::text::extraction_cmap::extract_truetype_kerning;

        // 12 leading bytes and one 16-byte `head` record; no `kern` record.
        let mut ttf_data: Vec<u8> = vec![
            0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, b'h', b'e',
            b'a', b'd', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x36,
        ];
        // Followed by 54 zero bytes.
        ttf_data.extend_from_slice(&[0u8; 54]);

        let result = extract_truetype_kerning(&ttf_data);
        assert!(
            result.is_ok(),
            "Should gracefully handle missing kern table"
        );

        let kerning_map = result.unwrap();
        assert!(
            kerning_map.is_empty(),
            "Should return empty HashMap when no kern table exists"
        );
    }
}