1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable settings that control how [`TextExtractor`] turns page content into text.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, a positioned [`TextFragment`] is recorded for every text run and the
    /// page text is rebuilt from those fragments instead of from content-stream order.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which a space is
    /// inserted between two runs on the same line.
    pub space_threshold: f64,
    /// Vertical distance above which two runs are treated as belonging to different lines.
    /// Also used as the band height when fragments are sorted by position.
    pub newline_threshold: f64,
    /// When `true`, collected fragments are sorted by descending `y` band, then ascending `x`.
    pub sort_by_position: bool,
    /// When `true`, position sorting is followed by a column-grouping pass.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on one line that counts as a
    /// column break.
    pub column_threshold: f64,
    /// When `true`, a line whose last fragment ends in `-` is joined with the next line
    /// (the hyphen is dropped) while text is rebuilt from fragments.
    pub merge_hyphenated: bool,
    /// NOTE(review): not read anywhere in the visible part of this file; presumably enables
    /// recording of [`SpaceDecision`] entries — confirm against the rest of the crate.
    pub track_space_decisions: bool,
}
37
38impl Default for ExtractionOptions {
39 fn default() -> Self {
40 Self {
41 preserve_layout: false,
42 space_threshold: 0.3,
43 newline_threshold: 10.0,
44 sort_by_position: true,
45 detect_columns: false,
46 column_threshold: 50.0,
47 merge_hyphenated: true,
48 track_space_decisions: false,
49 }
50 }
51}
52
/// Result of extracting one page: the assembled text plus any positioned fragments.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as a single string.
    pub text: String,
    /// Positioned text runs; only populated when `ExtractionOptions::preserve_layout`
    /// is enabled, otherwise empty.
    pub fragments: Vec<TextFragment>,
}
61
/// Record describing one "insert a space or not" decision.
///
/// NOTE(review): nothing in the visible part of this file constructs this type, so the
/// field descriptions below are inferred from names only — confirm against the producer.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Presumably the position in the fragment text the decision applies to.
    pub offset: usize,
    /// Presumably the horizontal gap that was measured.
    pub dx: f64,
    /// Presumably the threshold `dx` was compared against.
    pub threshold: f64,
    /// Presumably a score for how certain the decision is; scale unknown — TODO confirm.
    pub confidence: f64,
    /// Presumably whether a space was actually inserted.
    pub inserted: bool,
}
77
/// A single run of decoded text together with its position and font attributes.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the run.
    pub text: String,
    /// X coordinate of the run's origin, as computed by `text_origin`.
    pub x: f64,
    /// Y coordinate of the run's origin, as computed by `text_origin`.
    pub y: f64,
    /// Estimated width of the run (see `calculate_text_width`).
    pub width: f64,
    /// Height of the run; the extractor fills this with the current font size.
    pub height: f64,
    /// Font size in effect when the run was shown.
    pub font_size: f64,
    /// Resource name of the font in effect, if a font had been selected.
    pub font_name: Option<String>,
    /// Bold flag derived from the font name by `parse_font_style`.
    pub is_bold: bool,
    /// Italic flag derived from the font name by `parse_font_style`.
    pub is_italic: bool,
    /// Non-stroking (fill) colour in effect, if one had been set.
    pub color: Option<Color>,
    /// Space decisions attached to this run; the extractor in this file always leaves
    /// it empty.
    pub space_decisions: Vec<SpaceDecision>,
}
104
/// Mutable interpreter state tracked while walking a page's content-stream operations.
///
/// Matrices are stored as six-element arrays `[a, b, c, d, e, f]`.
struct TextState {
    /// Current text matrix; reset on `BeginText` and advanced after each shown run.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; the base for line-moving operators.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix, accumulated from `SetTransformMatrix`.
    ctm: [f64; 6],
    /// Line leading, set by `SetLeading`; used by the next-line operators.
    leading: f64,
    /// Character spacing, set by `SetCharSpacing` (stored but not read in this file).
    char_space: f64,
    /// Word spacing, set by `SetWordSpacing` (stored but not read in this file).
    word_space: f64,
    /// Horizontal scaling as a percentage (100.0 means unscaled).
    horizontal_scale: f64,
    /// Text rise, set by `SetTextRise` (stored but not read in this file).
    text_rise: f64,
    /// Current font size, set by `SetFont`.
    font_size: f64,
    /// Current font resource name, set by `SetFont`.
    font_name: Option<String>,
    /// Text render mode, set by `SetTextRenderMode` (stored but not read in this file).
    render_mode: u8,
    /// Current non-stroking colour, set by the gray/RGB/CMYK non-stroking operators.
    fill_color: Option<Color>,
}
132
133impl Default for TextState {
134 fn default() -> Self {
135 Self {
136 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139 leading: 0.0,
140 char_space: 0.0,
141 word_space: 0.0,
142 horizontal_scale: 100.0,
143 text_rise: 0.0,
144 font_size: 0.0,
145 font_name: None,
146 render_mode: 0,
147 fill_color: None,
148 }
149 }
150}
151
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// Matching is case-insensitive and purely substring based:
/// * bold: contains `bold`, `-b` or ` b `, or ends with ` b`;
/// * italic: contains `italic`, `oblique`, `-i` or ` i `, or ends with ` i`.
///
/// Because `-b` and `-i` are matched anywhere in the name, any name with a
/// hyphen followed by `b`/`i` is flagged too (for example a `-Book` suffix).
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let lowered = font_name.to_lowercase();

    // A style is present when any infix marker occurs or the name ends with the
    // stand-alone abbreviation.
    let has_style = |infixes: &[&str], trailing: &str| {
        infixes.iter().any(|marker| lowered.contains(*marker)) || lowered.ends_with(trailing)
    };

    let bold = has_style(&["bold", "-b", " b "], " b");
    let italic = has_style(&["italic", "oblique", "-i", " i "], " i");

    (bold, italic)
}
190
/// Extracts text (and optionally positioned fragments) from parsed PDF pages.
pub struct TextExtractor {
    /// Settings applied to every extraction performed by this instance.
    options: ExtractionOptions,
    /// Fonts of the page currently being processed, keyed by resource name.
    /// Cleared and refilled at the start of each page.
    font_cache: HashMap<String, FontInfo>,
    /// Parsed fonts keyed by the `(u32, u16)` object reference they were loaded from, so
    /// a font object shared between pages is parsed only once. Never cleared in this file.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202 pub fn new() -> Self {
204 Self {
205 options: ExtractionOptions::default(),
206 font_cache: HashMap::new(),
207 font_object_cache: HashMap::new(),
208 }
209 }
210
211 pub fn with_options(options: ExtractionOptions) -> Self {
213 Self {
214 options,
215 font_cache: HashMap::new(),
216 font_object_cache: HashMap::new(),
217 }
218 }
219
220 pub fn extract_from_document<R: Read + Seek>(
222 &mut self,
223 document: &PdfDocument<R>,
224 ) -> ParseResult<Vec<ExtractedText>> {
225 let page_count = document.page_count()?;
226 let mut results = Vec::new();
227
228 for i in 0..page_count {
229 let text = self.extract_from_page(document, i)?;
230 results.push(text);
231 }
232
233 Ok(results)
234 }
235
236 pub fn extract_from_page<R: Read + Seek>(
238 &mut self,
239 document: &PdfDocument<R>,
240 page_index: u32,
241 ) -> ParseResult<ExtractedText> {
242 let page = document.get_page(page_index)?;
244
245 {
247 let _span = tracing::info_span!("font_resources").entered();
248 self.extract_font_resources(&page, document)?;
249 }
250
251 let streams = {
253 let _span = tracing::info_span!("stream_decompress").entered();
254 page.content_streams_with_document(document)?
255 };
256
257 let mut extracted_text = String::new();
258 let mut fragments = Vec::new();
259 let mut state = TextState::default();
260 let mut in_text_object = false;
261 let mut last_x = 0.0;
262 let mut last_y = 0.0;
263
264 for (stream_idx, stream_data) in streams.iter().enumerate() {
266 let operations = match {
267 let _span = tracing::info_span!("content_parse").entered();
268 ContentParser::parse_content(stream_data)
269 } {
270 Ok(ops) => ops,
271 Err(e) => {
272 tracing::debug!(
274 "Warning: Failed to parse content stream on page {}, stream {}/{}",
275 page_index + 1,
276 stream_idx + 1,
277 streams.len()
278 );
279 tracing::debug!(" Error: {}", e);
280 tracing::debug!(" Stream size: {} bytes", stream_data.len());
281
282 let preview_len = stream_data.len().min(100);
284 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
285 tracing::debug!(
286 " Stream preview (first {} bytes): {:?}",
287 preview_len,
288 preview.chars().take(80).collect::<String>()
289 );
290
291 continue;
293 }
294 };
295
296 let _ops_span = tracing::info_span!("text_ops_loop").entered();
297 for op in operations {
298 match op {
299 ContentOperation::BeginText => {
300 in_text_object = true;
301 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
303 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
304 }
305
306 ContentOperation::EndText => {
307 in_text_object = false;
308 }
309
310 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
311 state.text_matrix =
312 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
313 state.text_line_matrix =
314 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
315 }
316
317 ContentOperation::MoveText(tx, ty) => {
318 let new_matrix = multiply_matrix(
320 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
321 &state.text_line_matrix,
322 );
323 state.text_matrix = new_matrix;
324 state.text_line_matrix = new_matrix;
325 }
326
327 ContentOperation::NextLine => {
328 let new_matrix = multiply_matrix(
330 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
331 &state.text_line_matrix,
332 );
333 state.text_matrix = new_matrix;
334 state.text_line_matrix = new_matrix;
335 }
336
337 ContentOperation::ShowText(text) => {
338 if in_text_object {
339 let text_bytes = &text;
340 let decoded = self.decode_text(text_bytes, &state)?;
341
342 let (x, y) = text_origin(&state);
344
345 if !extracted_text.is_empty() {
347 let dx = x - last_x;
348 let dy = (y - last_y).abs();
349
350 if dy > self.options.newline_threshold {
351 extracted_text.push('\n');
352 } else if dx > self.options.space_threshold * state.font_size {
353 extracted_text.push(' ');
354 }
355 }
356
357 extracted_text.push_str(&decoded);
358
359 let font_info = state
361 .font_name
362 .as_ref()
363 .and_then(|name| self.font_cache.get(name));
364
365 let text_width =
367 calculate_text_width(&decoded, state.font_size, font_info);
368
369 if self.options.preserve_layout {
370 emit_text_fragment(
371 &mut fragments,
372 &decoded,
373 text_width,
374 x,
375 y,
376 &state,
377 );
378 }
379
380 last_x = x + text_width;
382 last_y = y;
383
384 let tx = text_width * state.horizontal_scale / 100.0;
386 state.text_matrix =
387 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
388 }
389 }
390
391 ContentOperation::ShowTextArray(array) => {
392 if in_text_object {
393 let font_info = state
395 .font_name
396 .as_ref()
397 .and_then(|name| self.font_cache.get(name));
398
399 for item in array {
400 match item {
401 TextElement::Text(text_bytes) => {
402 let decoded = self.decode_text(&text_bytes, &state)?;
403 extracted_text.push_str(&decoded);
404
405 let text_width = calculate_text_width(
406 &decoded,
407 state.font_size,
408 font_info,
409 );
410
411 if self.options.preserve_layout {
412 let (x, y) = text_origin(&state);
413 emit_text_fragment(
414 &mut fragments,
415 &decoded,
416 text_width,
417 x,
418 y,
419 &state,
420 );
421 }
422
423 let tx = text_width * state.horizontal_scale / 100.0;
424 state.text_matrix = multiply_matrix(
425 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
426 &state.text_matrix,
427 );
428 }
429 TextElement::Spacing(adjustment) => {
430 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
432 state.text_matrix = multiply_matrix(
433 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
434 &state.text_matrix,
435 );
436 }
437 }
438 }
439 }
440 }
441
442 ContentOperation::NextLineShowText(text) => {
443 if in_text_object {
444 let new_matrix = multiply_matrix(
446 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
447 &state.text_line_matrix,
448 );
449 state.text_matrix = new_matrix;
450 state.text_line_matrix = new_matrix;
451
452 let decoded = self.decode_text(&text, &state)?;
453 let (x, y) = text_origin(&state);
454
455 if !extracted_text.is_empty() {
456 extracted_text.push('\n');
457 }
458 extracted_text.push_str(&decoded);
459
460 let font_info = state
461 .font_name
462 .as_ref()
463 .and_then(|name| self.font_cache.get(name));
464 let text_width =
465 calculate_text_width(&decoded, state.font_size, font_info);
466
467 if self.options.preserve_layout {
468 emit_text_fragment(
469 &mut fragments,
470 &decoded,
471 text_width,
472 x,
473 y,
474 &state,
475 );
476 }
477
478 last_x = x + text_width;
479 last_y = y;
480
481 let tx = text_width * state.horizontal_scale / 100.0;
482 state.text_matrix =
483 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
484 }
485 }
486
487 ContentOperation::SetSpacingNextLineShowText(word_space, char_space, text) => {
488 if in_text_object {
489 state.word_space = word_space as f64;
493 state.char_space = char_space as f64;
494
495 let new_matrix = multiply_matrix(
496 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
497 &state.text_line_matrix,
498 );
499 state.text_matrix = new_matrix;
500 state.text_line_matrix = new_matrix;
501
502 let decoded = self.decode_text(&text, &state)?;
503 let (x, y) = text_origin(&state);
504
505 if !extracted_text.is_empty() {
506 extracted_text.push('\n');
507 }
508 extracted_text.push_str(&decoded);
509
510 let font_info = state
511 .font_name
512 .as_ref()
513 .and_then(|name| self.font_cache.get(name));
514 let text_width =
515 calculate_text_width(&decoded, state.font_size, font_info);
516
517 if self.options.preserve_layout {
518 emit_text_fragment(
519 &mut fragments,
520 &decoded,
521 text_width,
522 x,
523 y,
524 &state,
525 );
526 }
527
528 last_x = x + text_width;
529 last_y = y;
530
531 let tx = text_width * state.horizontal_scale / 100.0;
532 state.text_matrix =
533 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
534 }
535 }
536
537 ContentOperation::SetFont(name, size) => {
538 state.font_name = Some(name);
539 state.font_size = size as f64;
540 }
541
542 ContentOperation::SetLeading(leading) => {
543 state.leading = leading as f64;
544 }
545
546 ContentOperation::SetCharSpacing(spacing) => {
547 state.char_space = spacing as f64;
548 }
549
550 ContentOperation::SetWordSpacing(spacing) => {
551 state.word_space = spacing as f64;
552 }
553
554 ContentOperation::SetHorizontalScaling(scale) => {
555 state.horizontal_scale = scale as f64;
556 }
557
558 ContentOperation::SetTextRise(rise) => {
559 state.text_rise = rise as f64;
560 }
561
562 ContentOperation::SetTextRenderMode(mode) => {
563 state.render_mode = mode as u8;
564 }
565
566 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
567 let [a0, b0, c0, d0, e0, f0] = state.ctm;
569 let a = a as f64;
570 let b = b as f64;
571 let c = c as f64;
572 let d = d as f64;
573 let e = e as f64;
574 let f = f as f64;
575 state.ctm = [
576 a * a0 + b * c0,
577 a * b0 + b * d0,
578 c * a0 + d * c0,
579 c * b0 + d * d0,
580 e * a0 + f * c0 + e0,
581 e * b0 + f * d0 + f0,
582 ];
583 }
584
585 ContentOperation::SetNonStrokingGray(gray) => {
587 state.fill_color = Some(Color::gray(gray as f64));
588 }
589
590 ContentOperation::SetNonStrokingRGB(r, g, b) => {
591 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
592 }
593
594 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
595 state.fill_color =
596 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
597 }
598
599 _ => {
600 }
602 }
603 }
604 }
605
606 {
607 let _span = tracing::info_span!("layout_finalize").entered();
608
609 if self.options.sort_by_position && !fragments.is_empty() {
611 self.sort_and_merge_fragments(&mut fragments);
612 }
613
614 if self.options.preserve_layout && !fragments.is_empty() {
617 fragments = self.merge_close_fragments(&fragments);
618 }
619
620 if self.options.preserve_layout && !fragments.is_empty() {
622 extracted_text = self.reconstruct_text_from_fragments(&fragments);
623 }
624 }
625
626 Ok(ExtractedText {
627 text: extracted_text,
628 fragments,
629 })
630 }
631
632 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
634 let threshold = self.options.newline_threshold;
642 fragments.sort_by(|a, b| {
643 let band_a = if threshold > 0.0 {
645 (-a.y / threshold).round()
646 } else {
647 -a.y
648 };
649 let band_b = if threshold > 0.0 {
650 (-b.y / threshold).round()
651 } else {
652 -b.y
653 };
654
655 band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
657 });
658
659 if self.options.detect_columns {
661 self.detect_and_sort_columns(fragments);
662 }
663 }
664
665 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
667 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
669 let mut current_line: Vec<&mut TextFragment> = Vec::new();
670 let mut last_y = f64::INFINITY;
671
672 for fragment in fragments.iter_mut() {
673 let fragment_y = fragment.y;
674 if (last_y - fragment_y).abs() > self.options.newline_threshold
675 && !current_line.is_empty()
676 {
677 lines.push(current_line);
678 current_line = Vec::new();
679 }
680 current_line.push(fragment);
681 last_y = fragment_y;
682 }
683 if !current_line.is_empty() {
684 lines.push(current_line);
685 }
686
687 let mut column_boundaries = vec![0.0];
689 for line in &lines {
690 if line.len() > 1 {
691 for i in 0..line.len() - 1 {
692 let gap = line[i + 1].x - (line[i].x + line[i].width);
693 if gap > self.options.column_threshold {
694 let boundary = line[i].x + line[i].width + gap / 2.0;
695 if !column_boundaries
696 .iter()
697 .any(|&b| (b - boundary).abs() < 10.0)
698 {
699 column_boundaries.push(boundary);
700 }
701 }
702 }
703 }
704 }
705 column_boundaries.sort_by(|a, b| a.total_cmp(b));
706
707 if column_boundaries.len() > 1 {
709 fragments.sort_by(|a, b| {
710 let col_a = column_boundaries
712 .iter()
713 .position(|&boundary| a.x < boundary)
714 .unwrap_or(column_boundaries.len())
715 - 1;
716 let col_b = column_boundaries
717 .iter()
718 .position(|&boundary| b.x < boundary)
719 .unwrap_or(column_boundaries.len())
720 - 1;
721
722 if col_a != col_b {
723 col_a.cmp(&col_b)
724 } else {
725 b.y.total_cmp(&a.y)
727 }
728 });
729 }
730 }
731
732 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
734 let merged_fragments = self.merge_close_fragments(fragments);
736
737 let mut result = String::new();
738 let mut last_y = f64::INFINITY;
739 let mut last_x = 0.0;
740 let mut last_line_ended_with_hyphen = false;
741
742 for fragment in &merged_fragments {
743 let y_diff = (last_y - fragment.y).abs();
745 if !result.is_empty() && y_diff > self.options.newline_threshold {
746 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
748 if result.ends_with('-') {
750 result.pop();
751 }
752 } else {
753 result.push('\n');
754 }
755 } else if !result.is_empty() {
756 let x_gap = fragment.x - last_x;
758 if x_gap > self.options.space_threshold * fragment.font_size {
759 result.push(' ');
760 }
761 }
762
763 result.push_str(&fragment.text);
764 last_line_ended_with_hyphen = fragment.text.ends_with('-');
765 last_y = fragment.y;
766 last_x = fragment.x + fragment.width;
767 }
768
769 result
770 }
771
772 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
775 if fragments.is_empty() {
776 return Vec::new();
777 }
778
779 let mut merged = Vec::new();
780 let mut current = fragments[0].clone();
781
782 for fragment in &fragments[1..] {
783 let y_diff = (current.y - fragment.y).abs();
785 let x_gap = fragment.x - (current.x + current.width);
786
787 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
794 current.text.push_str(&fragment.text);
796 current.width = (fragment.x + fragment.width) - current.x;
797 } else {
798 merged.push(current);
800 current = fragment.clone();
801 }
802 }
803
804 merged.push(current);
805 merged
806 }
807
808 fn extract_font_resources<R: Read + Seek>(
814 &mut self,
815 page: &ParsedPage,
816 document: &PdfDocument<R>,
817 ) -> ParseResult<()> {
818 self.font_cache.clear();
820
821 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
824 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
825 {
826 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
827 for (font_name, font_obj) in font_dict.0.iter() {
828 if let Some(font_ref) = font_obj.as_reference() {
829 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
830 }
831 }
832 }
833 }
834 } else if let Some(resources) = page.get_resources() {
835 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
837 for (font_name, font_obj) in font_dict.0.iter() {
838 if let Some(font_ref) = font_obj.as_reference() {
839 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
840 }
841 }
842 }
843 }
844
845 Ok(())
846 }
847
848 fn cache_font_by_ref<R: Read + Seek>(
850 &mut self,
851 font_name: &str,
852 font_ref: (u32, u16),
853 document: &PdfDocument<R>,
854 ) {
855 if let Some(cached) = self.font_object_cache.get(&font_ref) {
857 self.font_cache
858 .insert(font_name.to_string(), cached.clone());
859 tracing::debug!(
860 "Reused cached font object ({}, {}): {} (ToUnicode: {})",
861 font_ref.0,
862 font_ref.1,
863 font_name,
864 cached.to_unicode.is_some()
865 );
866 return;
867 }
868
869 if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
871 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
872 if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
873 let has_to_unicode = font_info.to_unicode.is_some();
874 self.font_object_cache.insert(font_ref, font_info.clone());
876 self.font_cache.insert(font_name.to_string(), font_info);
878 tracing::debug!(
879 "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
880 font_ref.0,
881 font_ref.1,
882 font_name,
883 has_to_unicode
884 );
885 }
886 }
887 }
888
889 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
891 use crate::text::encoding::TextEncoding;
892
893 if let Some(ref font_name) = state.font_name {
895 if let Some(font_info) = self.font_cache.get(font_name) {
896 if let Ok(decoded) =
898 crate::text::extraction_cmap::decode_text_with_font(text, font_info)
899 {
900 if !decoded.trim().is_empty()
902 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
903 {
904 let sanitized = sanitize_extracted_text(&decoded);
906 tracing::debug!(
907 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
908 font_name,
909 text,
910 sanitized
911 );
912 return Ok(sanitized);
913 }
914 }
915
916 tracing::debug!(
917 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
918 font_name
919 );
920 }
921 }
922
923 let encoding = if let Some(ref font_name) = state.font_name {
925 match font_name.to_lowercase().as_str() {
926 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
927 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
928 name if name.contains("standard") => TextEncoding::StandardEncoding,
929 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
930 _ => {
931 if font_name.starts_with("Times")
933 || font_name.starts_with("Helvetica")
934 || font_name.starts_with("Courier")
935 {
936 TextEncoding::WinAnsiEncoding } else {
938 TextEncoding::PdfDocEncoding }
940 }
941 }
942 } else {
943 TextEncoding::WinAnsiEncoding };
945
946 let fallback_result = encoding.decode(text);
947 let sanitized = sanitize_extracted_text(&fallback_result);
949 tracing::debug!(
950 "Fallback encoding decoding: {:?} -> \"{}\"",
951 text,
952 sanitized
953 );
954 Ok(sanitized)
955 }
956}
957
958impl Default for TextExtractor {
959 fn default() -> Self {
960 Self::new()
961 }
962}
963
964fn emit_text_fragment(
974 fragments: &mut Vec<TextFragment>,
975 decoded: &str,
976 text_width: f64,
977 x: f64,
978 y: f64,
979 state: &TextState,
980) {
981 if decoded.is_empty() {
982 return;
983 }
984 let (is_bold, is_italic) = state
985 .font_name
986 .as_ref()
987 .map(|name| parse_font_style(name))
988 .unwrap_or((false, false));
989 fragments.push(TextFragment {
990 text: decoded.to_owned(),
991 x,
992 y,
993 width: text_width,
994 height: state.font_size,
995 font_size: state.font_size,
996 font_name: state.font_name.clone(),
997 is_bold,
998 is_italic,
999 color: state.fill_color,
1000 space_decisions: Vec::new(),
1001 });
1002}
1003
1004fn text_origin(state: &TextState) -> (f64, f64) {
1009 let combined = multiply_matrix(&state.ctm, &state.text_matrix);
1010 transform_point(0.0, 0.0, &combined)
1011}
1012
/// Multiplies two affine matrices given as `[a, b, c, d, e, f]`.
///
/// The result is `a × b` in row-vector form: transforming a point with the product
/// is the same as transforming it with `a` first and then with `b`.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    // Linear 2x2 part.
    let m0 = a0 * b0 + a1 * b2;
    let m1 = a0 * b1 + a1 * b3;
    let m2 = a2 * b0 + a3 * b2;
    let m3 = a2 * b1 + a3 * b3;
    // Translation part.
    let m4 = a4 * b0 + a5 * b2 + b4;
    let m5 = a4 * b1 + a5 * b3 + b5;

    [m0, m1, m2, m3, m4, m5]
}
1024
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`:
/// `x' = a·x + c·y + e`, `y' = b·x + d·y + f`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
1031
1032fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
1034 if let Some(font) = font_info {
1036 if let Some(ref widths) = font.metrics.widths {
1037 let first_char = font.metrics.first_char.unwrap_or(0);
1038 let last_char = font.metrics.last_char.unwrap_or(255);
1039 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
1040
1041 let mut total_width = 0.0;
1042 let mut chars = text.chars().peekable();
1043
1044 while let Some(ch) = chars.next() {
1045 let char_code = ch as u32;
1046
1047 let width = if char_code >= first_char && char_code <= last_char {
1049 let index = (char_code - first_char) as usize;
1050 widths.get(index).copied().unwrap_or(missing_width)
1051 } else {
1052 missing_width
1053 };
1054
1055 total_width += width / 1000.0 * font_size;
1057
1058 if let Some(ref kerning) = font.metrics.kerning {
1060 if let Some(&next_ch) = chars.peek() {
1061 let next_char = next_ch as u32;
1062 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
1063 total_width += kern_value / 1000.0 * font_size;
1065 }
1066 }
1067 }
1068 }
1069
1070 return total_width;
1071 }
1072 }
1073
1074 text.len() as f64 * font_size * 0.5
1076}
1077
/// Cleans decoded text by removing control characters and collapsing blanks.
///
/// * NUL (`\0`) is treated like a space; an ETX (`\u{3}`) is always dropped.
/// * Consecutive spaces (including NUL-derived ones) collapse into one, and a space
///   directly after a tab is dropped.
/// * Tab, line feed and carriage return are kept as they are.
/// * Every other ASCII control character is removed without affecting collapsing.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // True while the most recent kept character was a space or a tab.
    let mut after_blank = false;

    for ch in text.chars() {
        match ch {
            // NUL acts as a word separator, exactly like a literal space.
            '\0' | ' ' => {
                if !after_blank {
                    cleaned.push(' ');
                    after_blank = true;
                }
            }
            '\t' => {
                cleaned.push('\t');
                after_blank = true;
            }
            '\n' | '\r' => {
                cleaned.push(ch);
                after_blank = false;
            }
            // Remaining control characters (ETX included) vanish without a trace.
            c if c.is_ascii_control() => {}
            other => {
                cleaned.push(other);
                after_blank = false;
            }
        }
    }

    cleaned
}
1170
1171#[cfg(test)]
1172mod tests {
1173 use super::*;
1174
1175 #[test]
1176 fn test_matrix_multiplication() {
1177 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1178 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1179
1180 let result = multiply_matrix(&identity, &translation);
1181 assert_eq!(result, translation);
1182
1183 let result2 = multiply_matrix(&translation, &identity);
1184 assert_eq!(result2, translation);
1185 }
1186
1187 #[test]
1188 fn test_transform_point() {
1189 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1190 let (x, y) = transform_point(5.0, 5.0, &translation);
1191 assert_eq!(x, 15.0);
1192 assert_eq!(y, 25.0);
1193 }
1194
1195 #[test]
1196 fn test_extraction_options_default() {
1197 let options = ExtractionOptions::default();
1198 assert!(!options.preserve_layout);
1199 assert_eq!(options.space_threshold, 0.3);
1200 assert_eq!(options.newline_threshold, 10.0);
1201 assert!(options.sort_by_position);
1202 assert!(!options.detect_columns);
1203 assert_eq!(options.column_threshold, 50.0);
1204 assert!(options.merge_hyphenated);
1205 }
1206
1207 #[test]
1208 fn test_extraction_options_custom() {
1209 let options = ExtractionOptions {
1210 preserve_layout: true,
1211 space_threshold: 0.5,
1212 newline_threshold: 15.0,
1213 sort_by_position: false,
1214 detect_columns: true,
1215 column_threshold: 75.0,
1216 merge_hyphenated: false,
1217 track_space_decisions: false,
1218 };
1219 assert!(options.preserve_layout);
1220 assert_eq!(options.space_threshold, 0.5);
1221 assert_eq!(options.newline_threshold, 15.0);
1222 assert!(!options.sort_by_position);
1223 assert!(options.detect_columns);
1224 assert_eq!(options.column_threshold, 75.0);
1225 assert!(!options.merge_hyphenated);
1226 }
1227
1228 #[test]
1229 fn test_parse_font_style_bold() {
1230 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1232 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1233
1234 assert_eq!(parse_font_style("Arial Bold"), (true, false));
1236 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1237
1238 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1240 }
1241
1242 #[test]
1243 fn test_parse_font_style_italic() {
1244 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1246 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1247
1248 assert_eq!(parse_font_style("Arial Italic"), (false, true));
1250 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1251
1252 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1254 }
1255
1256 #[test]
1257 fn test_parse_font_style_bold_italic() {
1258 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1259 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1260 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1261 }
1262
1263 #[test]
1264 fn test_parse_font_style_regular() {
1265 assert_eq!(parse_font_style("Helvetica"), (false, false));
1266 assert_eq!(parse_font_style("Times-Roman"), (false, false));
1267 assert_eq!(parse_font_style("Courier"), (false, false));
1268 assert_eq!(parse_font_style("Arial"), (false, false));
1269 }
1270
1271 #[test]
1272 fn test_parse_font_style_edge_cases() {
1273 assert_eq!(parse_font_style(""), (false, false));
1275 assert_eq!(parse_font_style("UnknownFont"), (false, false));
1276
1277 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1279 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1280 }
1281
1282 #[test]
1283 fn test_text_fragment() {
1284 let fragment = TextFragment {
1285 text: "Hello".to_string(),
1286 x: 100.0,
1287 y: 200.0,
1288 width: 50.0,
1289 height: 12.0,
1290 font_size: 10.0,
1291 font_name: None,
1292 is_bold: false,
1293 is_italic: false,
1294 color: None,
1295 space_decisions: Vec::new(),
1296 };
1297 assert_eq!(fragment.text, "Hello");
1298 assert_eq!(fragment.x, 100.0);
1299 assert_eq!(fragment.y, 200.0);
1300 assert_eq!(fragment.width, 50.0);
1301 assert_eq!(fragment.height, 12.0);
1302 assert_eq!(fragment.font_size, 10.0);
1303 }
1304
1305 #[test]
1306 fn test_extracted_text() {
1307 let fragments = vec![
1308 TextFragment {
1309 text: "Hello".to_string(),
1310 x: 100.0,
1311 y: 200.0,
1312 width: 50.0,
1313 height: 12.0,
1314 font_size: 10.0,
1315 font_name: None,
1316 is_bold: false,
1317 is_italic: false,
1318 color: None,
1319 space_decisions: Vec::new(),
1320 },
1321 TextFragment {
1322 text: "World".to_string(),
1323 x: 160.0,
1324 y: 200.0,
1325 width: 50.0,
1326 height: 12.0,
1327 font_size: 10.0,
1328 font_name: None,
1329 is_bold: false,
1330 is_italic: false,
1331 color: None,
1332 space_decisions: Vec::new(),
1333 },
1334 ];
1335
1336 let extracted = ExtractedText {
1337 text: "Hello World".to_string(),
1338 fragments: fragments,
1339 };
1340
1341 assert_eq!(extracted.text, "Hello World");
1342 assert_eq!(extracted.fragments.len(), 2);
1343 assert_eq!(extracted.fragments[0].text, "Hello");
1344 assert_eq!(extracted.fragments[1].text, "World");
1345 }
1346
1347 #[test]
1348 fn test_text_state_default() {
1349 let state = TextState::default();
1350 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1351 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1352 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1353 assert_eq!(state.leading, 0.0);
1354 assert_eq!(state.char_space, 0.0);
1355 assert_eq!(state.word_space, 0.0);
1356 assert_eq!(state.horizontal_scale, 100.0);
1357 assert_eq!(state.text_rise, 0.0);
1358 assert_eq!(state.font_size, 0.0);
1359 assert!(state.font_name.is_none());
1360 assert_eq!(state.render_mode, 0);
1361 }
1362
1363 #[test]
1364 fn test_matrix_operations() {
1365 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
1368 assert_eq!(x, 0.0);
1369 assert_eq!(y, 1.0);
1370
1371 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1373 let (x, y) = transform_point(5.0, 5.0, &scale);
1374 assert_eq!(x, 10.0);
1375 assert_eq!(y, 15.0);
1376
1377 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1379 let (x, y) = transform_point(1.0, 1.0, &complex);
1380 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
1383
1384 #[test]
1385 fn test_text_extractor_new() {
1386 let extractor = TextExtractor::new();
1387 let options = extractor.options;
1388 assert!(!options.preserve_layout);
1389 assert_eq!(options.space_threshold, 0.3);
1390 assert_eq!(options.newline_threshold, 10.0);
1391 assert!(options.sort_by_position);
1392 assert!(!options.detect_columns);
1393 assert_eq!(options.column_threshold, 50.0);
1394 assert!(options.merge_hyphenated);
1395 }
1396
1397 #[test]
1398 fn test_text_extractor_with_options() {
1399 let options = ExtractionOptions {
1400 preserve_layout: true,
1401 space_threshold: 0.3,
1402 newline_threshold: 12.0,
1403 sort_by_position: false,
1404 detect_columns: true,
1405 column_threshold: 60.0,
1406 merge_hyphenated: false,
1407 track_space_decisions: false,
1408 };
1409 let extractor = TextExtractor::with_options(options.clone());
1410 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1411 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1412 assert_eq!(
1413 extractor.options.newline_threshold,
1414 options.newline_threshold
1415 );
1416 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1417 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1418 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1419 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1420 }
1421
/// With no font information, the width is expected to be the simplified estimate
/// `len * font_size * 0.5`.
#[test]
fn test_calculate_text_width_with_no_font_info() {
    let font_size = 12.0;
    let measured = calculate_text_width("Hello", font_size, None);

    // 5 characters * 12.0 * 0.5 = 30.0 (exactly representable, so `assert_eq!` is safe).
    assert_eq!(
        measured, 30.0,
        "Without font info, should use simplified calculation: len * font_size * 0.5"
    );
}
1437
/// A font whose metrics have no widths array is expected to yield the same
/// simplified estimate as having no font at all, even with `missing_width` set.
#[test]
fn test_calculate_text_width_with_empty_metrics() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Metrics with no character range and no widths table; only `missing_width` is set.
    let bare_metrics = FontMetrics {
        first_char: None,
        last_char: None,
        widths: None,
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: bare_metrics,
    };

    let measured = calculate_text_width("Hello", 12.0, Some(&font));

    // 5 characters * 12.0 * 0.5 = 30.0
    assert_eq!(
        measured, 30.0,
        "Without widths array, should fall back to simplified calculation"
    );
}
1469
/// With a populated widths table, the measured width of "Hello" is expected to be
/// the sum of the per-glyph widths scaled by `font_size / 1000`.
#[test]
fn test_calculate_text_width_with_complete_metrics() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // 95 slots for character codes 32..=126; a glyph lives at index `code - 32`.
    let mut glyph_widths = vec![0.0; 95];
    for (letter, units) in [('H', 722.0), ('e', 556.0), ('l', 278.0), ('o', 611.0)] {
        glyph_widths[letter as usize - 32] = units;
    }

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    let width = calculate_text_width("Hello", 12.0, Some(&font_info));

    // H + e + l + l + o, in 1/1000 units of the font size.
    let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
    let tolerance = 0.0001;
    let delta = (width - expected).abs();
    assert!(
        delta < tolerance,
        "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
        expected,
        width,
        delta
    );

    // Guard against the fallback path: 5 chars * 12.0 * 0.5 = 30.0.
    let simplified = 5.0 * 12.0 * 0.5;
    assert_ne!(
        width, simplified,
        "Metrics-based calculation should differ from simplified (30.0)"
    );
}
1528
/// A character whose code lies outside `first_char..=last_char` is expected to be
/// measured with `missing_width`.
#[test]
fn test_calculate_text_width_character_outside_range() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Widths cover only codes 65..=90 ('A'..='Z'), all 722 units wide.
    let widths = vec![722.0; 26];
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(90),
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    // 'A' (65) is inside the range; '1' (49) is outside it.
    let width = calculate_text_width("A1", 10.0, Some(&font_info));

    // 'A' uses its table width, '1' falls back to missing_width.
    let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
    // 7.22 is not exactly representable, so compare with a tolerance instead of
    // relying on the implementation using the same evaluation order as this test.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Should use missing_width for characters outside range: expected {}, got {}",
        expected,
        width
    );
}
1567
/// An explicit 0.0 entry in the widths table is expected to be used as-is rather
/// than being replaced by `missing_width`.
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Character code 42 sits at index 42 - 32 = 10 of the widths table.
    let zero_width_code = 42u8;
    let mut table = vec![500.0; 95];
    table[usize::from(zero_width_code) - 32] = 0.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(table),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    let sample = char::from(zero_width_code).to_string();
    let measured = calculate_text_width(&sample, 10.0, Some(&font_info));

    assert_eq!(
        measured, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
1606
/// The empty string is expected to measure 0.0 both with and without font information.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Uniform 500-unit widths for codes 32..=126.
    let uniform_metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: uniform_metrics,
    };

    // Metrics-based path.
    let with_font = calculate_text_width("", 12.0, Some(&font));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Path without any font information.
    let without_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        without_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1639
/// A non-ASCII character outside the font's `first_char..=last_char` range is
/// expected to be measured with `missing_width`.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Widths cover only codes 32..=126; anything else should use missing_width (600).
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(vec![500.0; 95]),
            missing_width: Some(600.0),
            kerning: None,
        },
    };

    // 'Ñ' is U+00D1 (209), above the last covered code.
    let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // 600 / 1000 * 10 = 6.0. Compared with a tolerance, like the other
    // metrics-based tests, because 0.6 is not exactly representable.
    let expected = 600.0 / 1000.0 * 10.0;
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Unicode character outside range should use missing_width: expected {}, got {}",
        expected,
        width
    );
}
1673
/// The measured width of a glyph is expected to scale linearly with the font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Single-glyph font: only code 65 ('A') is covered, 722 units wide.
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(65),
            widths: Some(vec![722.0]),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
    let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

    // 0.722 is not exactly representable; use a tolerance so the test does not
    // depend on the implementation's exact order of floating-point operations.
    let tolerance = 0.0001;
    let expected_10 = 722.0 / 1000.0 * 10.0;
    let expected_20 = 722.0 / 1000.0 * 20.0;
    assert!(
        (width_10 - expected_10).abs() < tolerance,
        "Width at size 10: expected {}, got {}",
        expected_10,
        width_10
    );
    assert!(
        (width_20 - expected_20).abs() < tolerance,
        "Width at size 20: expected {}, got {}",
        expected_20,
        width_20
    );
    assert!(
        (width_20 - width_10 * 2.0).abs() < tolerance,
        "Width should scale linearly with font size"
    );
}
1709
/// The same character is expected to be narrower in a proportional font than in
/// a monospace one when the widths tables say so.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts cover codes 105..=107 ('i', 'j', 'k'); only the name, the
    // widths table and the missing width differ.
    let build_font = |name: &str, widths, missing_width| FontInfo {
        name: name.to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        },
    };

    let proportional_font = build_font("Helvetica", vec![278.0, 556.0, 722.0], 500.0);
    let monospace_font = build_font("Courier", vec![600.0, 600.0, 600.0], 600.0);

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    // 'i' is 278 units in the proportional table versus 600 in the monospace one.
    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1765
/// Kerning adjustments are expected to be added to the summed glyph widths, and
/// only for the exact (left, right) pairs present in the kerning map.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Width table for codes 32..=126 (index = code - 32), 500 units by default,
    // with 'A' (65), 'V' (86) and 'W' (87) overridden.
    let mut glyph_widths = vec![500.0; 95];
    for (code, units) in [(65usize, 722.0), (86, 722.0), (87, 944.0)] {
        glyph_widths[code - 32] = units;
    }

    // Kerning keyed by (left code, right code): A+V = -50, A+W = -40.
    let mut kerning_pairs = HashMap::new();
    for (pair, adjustment) in [((65, 86), -50.0), ((65, 87), -40.0)] {
        kerning_pairs.insert(pair, adjustment);
    }

    let font_info = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(glyph_widths),
            missing_width: Some(500.0),
            kerning: Some(kerning_pairs),
        },
    };

    let tolerance = 0.0001;

    // "AV": both glyph widths plus the -50 kerning adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let diff_av = (width_av - expected_av).abs();
    assert!(
        diff_av < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        diff_av
    );

    // "AW": both glyph widths plus the -40 kerning adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let diff_aw = (width_aw - expected_aw).abs();
    assert!(
        diff_aw < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        diff_aw
    );

    // "VA": the reversed pair is not in the map, so no adjustment is expected.
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let diff_va = (width_va - expected_va).abs();
    assert!(
        diff_va < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        diff_va
    );

    // Same two glyphs, but only the kerned order gets tightened.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1851
/// Feeds a hand-built, minimal font byte stream with a `head` and a `kern` table
/// to `parse_truetype_kern_table` and expects exactly two kerning pairs back.
///
/// Field names in the comments follow the OpenType table directory and `kern`
/// (version 0) layout — NOTE(review): confirm against the parser's expectations.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // 12-byte font header announcing two tables.
    let header: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // version tag 0x00010000
        0x00, 0x02, // number of tables: 2
        0x00, 0x20, // searchRange
        0x00, 0x01, // entrySelector
        0x00, 0x00, // rangeShift
    ];

    // 16-byte table records: tag, 4 zero bytes (checksum slot), offset, length.
    // `head` data starts at 12 + 16 + 16 = 44 (0x2C) and is 54 (0x36) bytes long.
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', //
        0x00, 0x00, 0x00, 0x00, //
        0x00, 0x00, 0x00, 0x2C, //
        0x00, 0x00, 0x00, 0x36, //
    ];
    // `kern` data starts at 44 + 54 = 98 (0x62) and is 30 (0x1E) bytes long.
    let kern_record: [u8; 16] = [
        b'k', b'e', b'r', b'n', //
        0x00, 0x00, 0x00, 0x00, //
        0x00, 0x00, 0x00, 0x62, //
        0x00, 0x00, 0x00, 0x1E, //
    ];

    // Placeholder `head` table contents.
    let head_table = [0u8; 54];

    // `kern` table: 4-byte table header, 14-byte subtable header, two 6-byte pairs.
    let kern_table: [u8; 30] = [
        0x00, 0x00, // table version 0
        0x00, 0x01, // one subtable
        0x00, 0x00, // subtable version
        0x00, 0x1A, // subtable length: 26 bytes
        0x00, 0x00, // coverage
        0x00, 0x02, // number of pairs: 2
        0x00, 0x08, // searchRange
        0x00, 0x00, // entrySelector
        0x00, 0x04, // rangeShift
        0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, // left 65, right 86, value -50
        0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8, // left 65, right 87, value -40
    ];

    // Same byte order as the on-disk layout described by the records above.
    let ttf_data: Vec<u8> = [
        &header[..],
        &head_record[..],
        &kern_record[..],
        &head_table[..],
        &kern_table[..],
    ]
    .concat();

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    let av_pair = kerning_map.get(&(65, 86));
    assert_eq!(
        av_pair,
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    let aw_pair = kerning_map.get(&(65, 87));
    assert_eq!(
        aw_pair,
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1934
/// A font byte stream that only declares a `head` table is expected to make
/// `extract_truetype_kerning` succeed with an empty kerning map.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // 12-byte font header announcing a single table.
    let header: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // version tag 0x00010000
        0x00, 0x01, // number of tables: 1
        0x00, 0x10, // searchRange
        0x00, 0x00, // entrySelector
        0x00, 0x00, // rangeShift
    ];

    // Single 16-byte table record: `head` at offset 12 + 16 = 28 (0x1C), 54 (0x36) bytes.
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', //
        0x00, 0x00, 0x00, 0x00, //
        0x00, 0x00, 0x00, 0x1C, //
        0x00, 0x00, 0x00, 0x36, //
    ];

    // Placeholder `head` table contents.
    let head_table = [0u8; 54];

    let ttf_data: Vec<u8> = [&header[..], &head_record[..], &head_table[..]].concat();

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let kerning_map = result.unwrap();
    assert!(
        kerning_map.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1974}