1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Settings that tune how [`TextExtractor`] turns page content into text.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned [`TextFragment`]s are recorded while the page
    /// is interpreted and the final text is rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which
    /// a space is inserted between two consecutive text runs.
    pub space_threshold: f64,
    /// Vertical distance above which two runs are treated as belonging to
    /// different lines. Also used as the band height when fragments are
    /// sorted by position.
    pub newline_threshold: f64,
    /// When `true`, recorded fragments are sorted by descending `y` band and
    /// then by ascending `x`.
    pub sort_by_position: bool,
    /// When `true`, the position sort is followed by a column-detection pass
    /// that regroups fragments column by column.
    pub detect_columns: bool,
    /// Horizontal gap between neighbouring fragments on one line above which
    /// a column boundary is assumed.
    pub column_threshold: f64,
    /// When `true`, a line that ends in `-` is joined to the following line
    /// (and the hyphen removed) while text is rebuilt from fragments.
    pub merge_hyphenated: bool,
    /// NOTE(review): not read by the code visible in this part of the file;
    /// presumably enables filling [`TextFragment::space_decisions`] — confirm.
    pub track_space_decisions: bool,
}
37
impl Default for ExtractionOptions {
    /// Returns the default configuration: plain text extraction (no layout
    /// preservation, no column detection) with position sorting and
    /// hyphenation merging enabled.
    fn default() -> Self {
        Self {
            preserve_layout: false,
            // A gap wider than 0.3 × font size counts as a word break.
            space_threshold: 0.3,
            // Runs more than 10 units apart vertically start a new line.
            newline_threshold: 10.0,
            sort_by_position: true,
            detect_columns: false,
            // Gaps wider than 50 units on a line suggest a column boundary.
            column_threshold: 50.0,
            merge_hyphenated: true,
            track_space_decisions: false,
        }
    }
}
52
/// Result of extracting the text of one page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The extracted text of the page.
    pub text: String,
    /// Positioned pieces of text. The extractor in this file only records
    /// fragments when `ExtractionOptions::preserve_layout` is enabled;
    /// otherwise this is empty.
    pub fragments: Vec<TextFragment>,
}
61
/// Record describing one "insert a space here or not" decision.
///
/// NOTE(review): no code visible in this part of the file constructs or reads
/// this type, so the field descriptions below are inferred from the names and
/// should be confirmed against the producer.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Presumably the offset in the fragment text the decision applies to.
    pub offset: usize,
    /// Presumably the horizontal gap that was measured.
    pub dx: f64,
    /// Presumably the threshold the gap was compared against.
    pub threshold: f64,
    /// Presumably a confidence score for the decision.
    pub confidence: f64,
    /// Presumably whether a space was actually inserted.
    pub inserted: bool,
}
77
/// A run of text together with its position and font attributes.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// The decoded text of the run.
    pub text: String,
    /// Horizontal coordinate of the run's origin, computed by the extractor
    /// from the text matrix and the current transformation matrix.
    pub x: f64,
    /// Vertical coordinate of the run's origin (same derivation as `x`).
    pub y: f64,
    /// Estimated width of the run (see `calculate_text_width`).
    pub width: f64,
    /// Height of the run; the extractor stores the font size here.
    pub height: f64,
    /// Font size that was active when the run was shown.
    pub font_size: f64,
    /// Name of the font that was active when the run was shown, if any.
    pub font_name: Option<String>,
    /// Bold flag as guessed from the font name by `parse_font_style`.
    pub is_bold: bool,
    /// Italic flag as guessed from the font name by `parse_font_style`.
    pub is_italic: bool,
    /// Non-stroking (fill) colour that was active, if one had been set.
    pub color: Option<Color>,
    /// Space-insertion records; the extractor in this file always leaves this
    /// empty.
    pub space_decisions: Vec<SpaceDecision>,
}
104
/// Mutable interpreter state tracked while walking a page's content
/// operations.
struct TextState {
    /// Current text matrix; advanced after each shown string.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; updated together with
    /// `text_matrix` by the matrix / move / next-line operations.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix, accumulated from `SetTransformMatrix`.
    ctm: [f64; 6],
    /// Line leading; used as the downward offset for `NextLine`.
    leading: f64,
    /// Character spacing (stored; not otherwise read in the visible code).
    char_space: f64,
    /// Word spacing (stored; not otherwise read in the visible code).
    word_space: f64,
    /// Horizontal scaling as a percentage (100.0 means unscaled).
    horizontal_scale: f64,
    /// Text rise (stored; not otherwise read in the visible code).
    text_rise: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Name of the currently selected font; used as the key into the
    /// extractor's per-page font cache.
    font_name: Option<String>,
    /// Text render mode (stored; not otherwise read in the visible code).
    render_mode: u8,
    /// Current non-stroking (fill) colour, if one has been set.
    fill_color: Option<Color>,
}
132
133impl Default for TextState {
134 fn default() -> Self {
135 Self {
136 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139 leading: 0.0,
140 char_space: 0.0,
141 word_space: 0.0,
142 horizontal_scale: 100.0,
143 text_rise: 0.0,
144 font_size: 0.0,
145 font_name: None,
146 render_mode: 0,
147 fill_color: None,
148 }
149 }
150}
151
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// The check is case-insensitive and purely textual:
/// * bold — the name contains `bold`, `-b` or ` b `, or ends with ` b`;
/// * italic — the name contains `italic`, `oblique`, `-i` or ` i `, or ends
///   with ` i`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let lowered = font_name.to_lowercase();

    // True when any of the given markers occurs anywhere in the name.
    let mentions = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    let bold = mentions(&["bold", "-b", " b "]) || lowered.ends_with(" b");
    let italic = mentions(&["italic", "oblique", "-i", " i "]) || lowered.ends_with(" i");

    (bold, italic)
}
190
/// Extracts text from the pages of a parsed PDF document.
pub struct TextExtractor {
    /// Extraction settings used for every page.
    options: ExtractionOptions,
    /// Fonts of the page currently being processed, keyed by the name the
    /// page's font resource dictionary gives them. Cleared at the start of
    /// every page.
    font_cache: HashMap<String, FontInfo>,
    /// Parsed fonts keyed by the indirect reference they were loaded from, so
    /// a font shared by several pages is only parsed once. Never cleared in
    /// the visible code.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202 pub fn new() -> Self {
204 Self {
205 options: ExtractionOptions::default(),
206 font_cache: HashMap::new(),
207 font_object_cache: HashMap::new(),
208 }
209 }
210
    /// Creates an extractor that uses the supplied `options` and starts with
    /// empty font caches.
    pub fn with_options(options: ExtractionOptions) -> Self {
        Self {
            options,
            font_cache: HashMap::new(),
            font_object_cache: HashMap::new(),
        }
    }
219
220 pub fn extract_from_document<R: Read + Seek>(
222 &mut self,
223 document: &PdfDocument<R>,
224 ) -> ParseResult<Vec<ExtractedText>> {
225 let page_count = document.page_count()?;
226 let mut results = Vec::new();
227
228 for i in 0..page_count {
229 let text = self.extract_from_page(document, i)?;
230 results.push(text);
231 }
232
233 Ok(results)
234 }
235
236 pub fn extract_from_page<R: Read + Seek>(
238 &mut self,
239 document: &PdfDocument<R>,
240 page_index: u32,
241 ) -> ParseResult<ExtractedText> {
242 let page = document.get_page(page_index)?;
244
245 {
247 let _span = tracing::info_span!("font_resources").entered();
248 self.extract_font_resources(&page, document)?;
249 }
250
251 let streams = {
253 let _span = tracing::info_span!("stream_decompress").entered();
254 page.content_streams_with_document(document)?
255 };
256
257 let mut extracted_text = String::new();
258 let mut fragments = Vec::new();
259 let mut state = TextState::default();
260 let mut in_text_object = false;
261 let mut last_x = 0.0;
262 let mut last_y = 0.0;
263
264 for (stream_idx, stream_data) in streams.iter().enumerate() {
266 let operations = match {
267 let _span = tracing::info_span!("content_parse").entered();
268 ContentParser::parse_content(stream_data)
269 } {
270 Ok(ops) => ops,
271 Err(e) => {
272 tracing::debug!(
274 "Warning: Failed to parse content stream on page {}, stream {}/{}",
275 page_index + 1,
276 stream_idx + 1,
277 streams.len()
278 );
279 tracing::debug!(" Error: {}", e);
280 tracing::debug!(" Stream size: {} bytes", stream_data.len());
281
282 let preview_len = stream_data.len().min(100);
284 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
285 tracing::debug!(
286 " Stream preview (first {} bytes): {:?}",
287 preview_len,
288 preview.chars().take(80).collect::<String>()
289 );
290
291 continue;
293 }
294 };
295
296 let _ops_span = tracing::info_span!("text_ops_loop").entered();
297 for op in operations {
298 match op {
299 ContentOperation::BeginText => {
300 in_text_object = true;
301 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
303 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
304 }
305
306 ContentOperation::EndText => {
307 in_text_object = false;
308 }
309
310 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
311 state.text_matrix =
312 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
313 state.text_line_matrix =
314 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
315 }
316
317 ContentOperation::MoveText(tx, ty) => {
318 let new_matrix = multiply_matrix(
320 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
321 &state.text_line_matrix,
322 );
323 state.text_matrix = new_matrix;
324 state.text_line_matrix = new_matrix;
325 }
326
327 ContentOperation::NextLine => {
328 let new_matrix = multiply_matrix(
330 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
331 &state.text_line_matrix,
332 );
333 state.text_matrix = new_matrix;
334 state.text_line_matrix = new_matrix;
335 }
336
337 ContentOperation::ShowText(text) => {
338 if in_text_object {
339 let text_bytes = &text;
340 let decoded = self.decode_text(text_bytes, &state)?;
341
342 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
345 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
346
347 if !extracted_text.is_empty() {
349 let dx = x - last_x;
350 let dy = (y - last_y).abs();
351
352 if dy > self.options.newline_threshold {
353 extracted_text.push('\n');
354 } else if dx > self.options.space_threshold * state.font_size {
355 extracted_text.push(' ');
356 }
357 }
358
359 extracted_text.push_str(&decoded);
360
361 let font_info = state
363 .font_name
364 .as_ref()
365 .and_then(|name| self.font_cache.get(name));
366
367 let text_width =
369 calculate_text_width(&decoded, state.font_size, font_info);
370
371 if self.options.preserve_layout {
372 let (is_bold, is_italic) = state
374 .font_name
375 .as_ref()
376 .map(|name| parse_font_style(name))
377 .unwrap_or((false, false));
378
379 fragments.push(TextFragment {
380 text: decoded.clone(),
381 x,
382 y,
383 width: text_width,
384 height: state.font_size,
385 font_size: state.font_size,
386 font_name: state.font_name.clone(),
387 is_bold,
388 is_italic,
389 color: state.fill_color,
390 space_decisions: Vec::new(),
391 });
392 }
393
394 last_x = x + text_width;
396 last_y = y;
397
398 let tx = text_width * state.horizontal_scale / 100.0;
400 state.text_matrix =
401 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
402 }
403 }
404
405 ContentOperation::ShowTextArray(array) => {
406 if in_text_object {
407 let font_info = state
409 .font_name
410 .as_ref()
411 .and_then(|name| self.font_cache.get(name));
412
413 for item in array {
414 match item {
415 TextElement::Text(text_bytes) => {
416 let decoded = self.decode_text(&text_bytes, &state)?;
417 extracted_text.push_str(&decoded);
418
419 let text_width = calculate_text_width(
421 &decoded,
422 state.font_size,
423 font_info,
424 );
425 let tx = text_width * state.horizontal_scale / 100.0;
426 state.text_matrix = multiply_matrix(
427 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
428 &state.text_matrix,
429 );
430 }
431 TextElement::Spacing(adjustment) => {
432 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
434 state.text_matrix = multiply_matrix(
435 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
436 &state.text_matrix,
437 );
438 }
439 }
440 }
441 }
442 }
443
444 ContentOperation::SetFont(name, size) => {
445 state.font_name = Some(name);
446 state.font_size = size as f64;
447 }
448
449 ContentOperation::SetLeading(leading) => {
450 state.leading = leading as f64;
451 }
452
453 ContentOperation::SetCharSpacing(spacing) => {
454 state.char_space = spacing as f64;
455 }
456
457 ContentOperation::SetWordSpacing(spacing) => {
458 state.word_space = spacing as f64;
459 }
460
461 ContentOperation::SetHorizontalScaling(scale) => {
462 state.horizontal_scale = scale as f64;
463 }
464
465 ContentOperation::SetTextRise(rise) => {
466 state.text_rise = rise as f64;
467 }
468
469 ContentOperation::SetTextRenderMode(mode) => {
470 state.render_mode = mode as u8;
471 }
472
473 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
474 let [a0, b0, c0, d0, e0, f0] = state.ctm;
476 let a = a as f64;
477 let b = b as f64;
478 let c = c as f64;
479 let d = d as f64;
480 let e = e as f64;
481 let f = f as f64;
482 state.ctm = [
483 a * a0 + b * c0,
484 a * b0 + b * d0,
485 c * a0 + d * c0,
486 c * b0 + d * d0,
487 e * a0 + f * c0 + e0,
488 e * b0 + f * d0 + f0,
489 ];
490 }
491
492 ContentOperation::SetNonStrokingGray(gray) => {
494 state.fill_color = Some(Color::gray(gray as f64));
495 }
496
497 ContentOperation::SetNonStrokingRGB(r, g, b) => {
498 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
499 }
500
501 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
502 state.fill_color =
503 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
504 }
505
506 _ => {
507 }
509 }
510 }
511 }
512
513 {
514 let _span = tracing::info_span!("layout_finalize").entered();
515
516 if self.options.sort_by_position && !fragments.is_empty() {
518 self.sort_and_merge_fragments(&mut fragments);
519 }
520
521 if self.options.preserve_layout && !fragments.is_empty() {
524 fragments = self.merge_close_fragments(&fragments);
525 }
526
527 if self.options.preserve_layout && !fragments.is_empty() {
529 extracted_text = self.reconstruct_text_from_fragments(&fragments);
530 }
531 }
532
533 Ok(ExtractedText {
534 text: extracted_text,
535 fragments,
536 })
537 }
538
539 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
541 let threshold = self.options.newline_threshold;
549 fragments.sort_by(|a, b| {
550 let band_a = if threshold > 0.0 {
552 (-a.y / threshold).round()
553 } else {
554 -a.y
555 };
556 let band_b = if threshold > 0.0 {
557 (-b.y / threshold).round()
558 } else {
559 -b.y
560 };
561
562 band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
564 });
565
566 if self.options.detect_columns {
568 self.detect_and_sort_columns(fragments);
569 }
570 }
571
572 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
574 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
576 let mut current_line: Vec<&mut TextFragment> = Vec::new();
577 let mut last_y = f64::INFINITY;
578
579 for fragment in fragments.iter_mut() {
580 let fragment_y = fragment.y;
581 if (last_y - fragment_y).abs() > self.options.newline_threshold
582 && !current_line.is_empty()
583 {
584 lines.push(current_line);
585 current_line = Vec::new();
586 }
587 current_line.push(fragment);
588 last_y = fragment_y;
589 }
590 if !current_line.is_empty() {
591 lines.push(current_line);
592 }
593
594 let mut column_boundaries = vec![0.0];
596 for line in &lines {
597 if line.len() > 1 {
598 for i in 0..line.len() - 1 {
599 let gap = line[i + 1].x - (line[i].x + line[i].width);
600 if gap > self.options.column_threshold {
601 let boundary = line[i].x + line[i].width + gap / 2.0;
602 if !column_boundaries
603 .iter()
604 .any(|&b| (b - boundary).abs() < 10.0)
605 {
606 column_boundaries.push(boundary);
607 }
608 }
609 }
610 }
611 }
612 column_boundaries.sort_by(|a, b| a.total_cmp(b));
613
614 if column_boundaries.len() > 1 {
616 fragments.sort_by(|a, b| {
617 let col_a = column_boundaries
619 .iter()
620 .position(|&boundary| a.x < boundary)
621 .unwrap_or(column_boundaries.len())
622 - 1;
623 let col_b = column_boundaries
624 .iter()
625 .position(|&boundary| b.x < boundary)
626 .unwrap_or(column_boundaries.len())
627 - 1;
628
629 if col_a != col_b {
630 col_a.cmp(&col_b)
631 } else {
632 b.y.total_cmp(&a.y)
634 }
635 });
636 }
637 }
638
639 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
641 let merged_fragments = self.merge_close_fragments(fragments);
643
644 let mut result = String::new();
645 let mut last_y = f64::INFINITY;
646 let mut last_x = 0.0;
647 let mut last_line_ended_with_hyphen = false;
648
649 for fragment in &merged_fragments {
650 let y_diff = (last_y - fragment.y).abs();
652 if !result.is_empty() && y_diff > self.options.newline_threshold {
653 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
655 if result.ends_with('-') {
657 result.pop();
658 }
659 } else {
660 result.push('\n');
661 }
662 } else if !result.is_empty() {
663 let x_gap = fragment.x - last_x;
665 if x_gap > self.options.space_threshold * fragment.font_size {
666 result.push(' ');
667 }
668 }
669
670 result.push_str(&fragment.text);
671 last_line_ended_with_hyphen = fragment.text.ends_with('-');
672 last_y = fragment.y;
673 last_x = fragment.x + fragment.width;
674 }
675
676 result
677 }
678
679 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
682 if fragments.is_empty() {
683 return Vec::new();
684 }
685
686 let mut merged = Vec::new();
687 let mut current = fragments[0].clone();
688
689 for fragment in &fragments[1..] {
690 let y_diff = (current.y - fragment.y).abs();
692 let x_gap = fragment.x - (current.x + current.width);
693
694 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
701 current.text.push_str(&fragment.text);
703 current.width = (fragment.x + fragment.width) - current.x;
704 } else {
705 merged.push(current);
707 current = fragment.clone();
708 }
709 }
710
711 merged.push(current);
712 merged
713 }
714
715 fn extract_font_resources<R: Read + Seek>(
721 &mut self,
722 page: &ParsedPage,
723 document: &PdfDocument<R>,
724 ) -> ParseResult<()> {
725 self.font_cache.clear();
727
728 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
731 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
732 {
733 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
734 for (font_name, font_obj) in font_dict.0.iter() {
735 if let Some(font_ref) = font_obj.as_reference() {
736 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
737 }
738 }
739 }
740 }
741 } else if let Some(resources) = page.get_resources() {
742 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
744 for (font_name, font_obj) in font_dict.0.iter() {
745 if let Some(font_ref) = font_obj.as_reference() {
746 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
747 }
748 }
749 }
750 }
751
752 Ok(())
753 }
754
755 fn cache_font_by_ref<R: Read + Seek>(
757 &mut self,
758 font_name: &str,
759 font_ref: (u32, u16),
760 document: &PdfDocument<R>,
761 ) {
762 if let Some(cached) = self.font_object_cache.get(&font_ref) {
764 self.font_cache
765 .insert(font_name.to_string(), cached.clone());
766 tracing::debug!(
767 "Reused cached font object ({}, {}): {} (ToUnicode: {})",
768 font_ref.0,
769 font_ref.1,
770 font_name,
771 cached.to_unicode.is_some()
772 );
773 return;
774 }
775
776 if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
778 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
779 if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
780 let has_to_unicode = font_info.to_unicode.is_some();
781 self.font_object_cache.insert(font_ref, font_info.clone());
783 self.font_cache.insert(font_name.to_string(), font_info);
785 tracing::debug!(
786 "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
787 font_ref.0,
788 font_ref.1,
789 font_name,
790 has_to_unicode
791 );
792 }
793 }
794 }
795
796 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
798 use crate::text::encoding::TextEncoding;
799
800 if let Some(ref font_name) = state.font_name {
802 if let Some(font_info) = self.font_cache.get(font_name) {
803 if let Ok(decoded) =
805 crate::text::extraction_cmap::decode_text_with_font(text, font_info)
806 {
807 if !decoded.trim().is_empty()
809 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
810 {
811 let sanitized = sanitize_extracted_text(&decoded);
813 tracing::debug!(
814 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
815 font_name,
816 text,
817 sanitized
818 );
819 return Ok(sanitized);
820 }
821 }
822
823 tracing::debug!(
824 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
825 font_name
826 );
827 }
828 }
829
830 let encoding = if let Some(ref font_name) = state.font_name {
832 match font_name.to_lowercase().as_str() {
833 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
834 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
835 name if name.contains("standard") => TextEncoding::StandardEncoding,
836 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
837 _ => {
838 if font_name.starts_with("Times")
840 || font_name.starts_with("Helvetica")
841 || font_name.starts_with("Courier")
842 {
843 TextEncoding::WinAnsiEncoding } else {
845 TextEncoding::PdfDocEncoding }
847 }
848 }
849 } else {
850 TextEncoding::WinAnsiEncoding };
852
853 let fallback_result = encoding.decode(text);
854 let sanitized = sanitize_extracted_text(&fallback_result);
856 tracing::debug!(
857 "Fallback encoding decoding: {:?} -> \"{}\"",
858 text,
859 sanitized
860 );
861 Ok(sanitized)
862 }
863}
864
impl Default for TextExtractor {
    /// Equivalent to [`TextExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}
870
/// Multiplies two affine matrices given as `[a, b, c, d, e, f]`.
///
/// The product is `a × b`: applying the result to a point is the same as
/// applying `a` first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // The translation of `a` is transformed by `b`, then shifted by `b`'s own.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
882
/// Applies the affine `matrix` (`[a, b, c, d, e, f]`) to the point `(x, y)`
/// and returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
889
890fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
892 if let Some(font) = font_info {
894 if let Some(ref widths) = font.metrics.widths {
895 let first_char = font.metrics.first_char.unwrap_or(0);
896 let last_char = font.metrics.last_char.unwrap_or(255);
897 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
898
899 let mut total_width = 0.0;
900 let mut chars = text.chars().peekable();
901
902 while let Some(ch) = chars.next() {
903 let char_code = ch as u32;
904
905 let width = if char_code >= first_char && char_code <= last_char {
907 let index = (char_code - first_char) as usize;
908 widths.get(index).copied().unwrap_or(missing_width)
909 } else {
910 missing_width
911 };
912
913 total_width += width / 1000.0 * font_size;
915
916 if let Some(ref kerning) = font.metrics.kerning {
918 if let Some(&next_ch) = chars.peek() {
919 let next_char = next_ch as u32;
920 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
921 total_width += kern_value / 1000.0 * font_size;
923 }
924 }
925 }
926 }
927
928 return total_width;
929 }
930 }
931
932 text.len() as f64 * font_size * 0.5
934}
935
/// Cleans decoded text for output.
///
/// * NUL characters and spaces become a single space; a space is not emitted
///   directly after another space or after a tab.
/// * Tabs, line feeds and carriage returns are kept as they are.
/// * Every other ASCII control character (including `U+0003`) is removed.
/// * All remaining characters are copied unchanged.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // True while the most recently written character was a space or a tab.
    let mut after_blank = false;

    for ch in text.chars() {
        match ch {
            '\0' | ' ' => {
                if !after_blank {
                    cleaned.push(' ');
                    after_blank = true;
                }
            }
            '\t' => {
                cleaned.push('\t');
                after_blank = true;
            }
            '\n' | '\r' => {
                cleaned.push(ch);
                after_blank = false;
            }
            // Dropped without touching the blank-tracking state.
            other if other.is_ascii_control() => {}
            other => {
                cleaned.push(other);
                after_blank = false;
            }
        }
    }

    cleaned
}
1028
1029#[cfg(test)]
1030mod tests {
1031 use super::*;
1032
1033 #[test]
1034 fn test_matrix_multiplication() {
1035 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1036 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1037
1038 let result = multiply_matrix(&identity, &translation);
1039 assert_eq!(result, translation);
1040
1041 let result2 = multiply_matrix(&translation, &identity);
1042 assert_eq!(result2, translation);
1043 }
1044
1045 #[test]
1046 fn test_transform_point() {
1047 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1048 let (x, y) = transform_point(5.0, 5.0, &translation);
1049 assert_eq!(x, 15.0);
1050 assert_eq!(y, 25.0);
1051 }
1052
1053 #[test]
1054 fn test_extraction_options_default() {
1055 let options = ExtractionOptions::default();
1056 assert!(!options.preserve_layout);
1057 assert_eq!(options.space_threshold, 0.3);
1058 assert_eq!(options.newline_threshold, 10.0);
1059 assert!(options.sort_by_position);
1060 assert!(!options.detect_columns);
1061 assert_eq!(options.column_threshold, 50.0);
1062 assert!(options.merge_hyphenated);
1063 }
1064
1065 #[test]
1066 fn test_extraction_options_custom() {
1067 let options = ExtractionOptions {
1068 preserve_layout: true,
1069 space_threshold: 0.5,
1070 newline_threshold: 15.0,
1071 sort_by_position: false,
1072 detect_columns: true,
1073 column_threshold: 75.0,
1074 merge_hyphenated: false,
1075 track_space_decisions: false,
1076 };
1077 assert!(options.preserve_layout);
1078 assert_eq!(options.space_threshold, 0.5);
1079 assert_eq!(options.newline_threshold, 15.0);
1080 assert!(!options.sort_by_position);
1081 assert!(options.detect_columns);
1082 assert_eq!(options.column_threshold, 75.0);
1083 assert!(!options.merge_hyphenated);
1084 }
1085
1086 #[test]
1087 fn test_parse_font_style_bold() {
1088 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1090 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1091
1092 assert_eq!(parse_font_style("Arial Bold"), (true, false));
1094 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1095
1096 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1098 }
1099
1100 #[test]
1101 fn test_parse_font_style_italic() {
1102 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1104 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1105
1106 assert_eq!(parse_font_style("Arial Italic"), (false, true));
1108 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1109
1110 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1112 }
1113
1114 #[test]
1115 fn test_parse_font_style_bold_italic() {
1116 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1117 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1118 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1119 }
1120
1121 #[test]
1122 fn test_parse_font_style_regular() {
1123 assert_eq!(parse_font_style("Helvetica"), (false, false));
1124 assert_eq!(parse_font_style("Times-Roman"), (false, false));
1125 assert_eq!(parse_font_style("Courier"), (false, false));
1126 assert_eq!(parse_font_style("Arial"), (false, false));
1127 }
1128
1129 #[test]
1130 fn test_parse_font_style_edge_cases() {
1131 assert_eq!(parse_font_style(""), (false, false));
1133 assert_eq!(parse_font_style("UnknownFont"), (false, false));
1134
1135 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1137 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1138 }
1139
1140 #[test]
1141 fn test_text_fragment() {
1142 let fragment = TextFragment {
1143 text: "Hello".to_string(),
1144 x: 100.0,
1145 y: 200.0,
1146 width: 50.0,
1147 height: 12.0,
1148 font_size: 10.0,
1149 font_name: None,
1150 is_bold: false,
1151 is_italic: false,
1152 color: None,
1153 space_decisions: Vec::new(),
1154 };
1155 assert_eq!(fragment.text, "Hello");
1156 assert_eq!(fragment.x, 100.0);
1157 assert_eq!(fragment.y, 200.0);
1158 assert_eq!(fragment.width, 50.0);
1159 assert_eq!(fragment.height, 12.0);
1160 assert_eq!(fragment.font_size, 10.0);
1161 }
1162
1163 #[test]
1164 fn test_extracted_text() {
1165 let fragments = vec![
1166 TextFragment {
1167 text: "Hello".to_string(),
1168 x: 100.0,
1169 y: 200.0,
1170 width: 50.0,
1171 height: 12.0,
1172 font_size: 10.0,
1173 font_name: None,
1174 is_bold: false,
1175 is_italic: false,
1176 color: None,
1177 space_decisions: Vec::new(),
1178 },
1179 TextFragment {
1180 text: "World".to_string(),
1181 x: 160.0,
1182 y: 200.0,
1183 width: 50.0,
1184 height: 12.0,
1185 font_size: 10.0,
1186 font_name: None,
1187 is_bold: false,
1188 is_italic: false,
1189 color: None,
1190 space_decisions: Vec::new(),
1191 },
1192 ];
1193
1194 let extracted = ExtractedText {
1195 text: "Hello World".to_string(),
1196 fragments: fragments,
1197 };
1198
1199 assert_eq!(extracted.text, "Hello World");
1200 assert_eq!(extracted.fragments.len(), 2);
1201 assert_eq!(extracted.fragments[0].text, "Hello");
1202 assert_eq!(extracted.fragments[1].text, "World");
1203 }
1204
1205 #[test]
1206 fn test_text_state_default() {
1207 let state = TextState::default();
1208 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1209 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1210 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1211 assert_eq!(state.leading, 0.0);
1212 assert_eq!(state.char_space, 0.0);
1213 assert_eq!(state.word_space, 0.0);
1214 assert_eq!(state.horizontal_scale, 100.0);
1215 assert_eq!(state.text_rise, 0.0);
1216 assert_eq!(state.font_size, 0.0);
1217 assert!(state.font_name.is_none());
1218 assert_eq!(state.render_mode, 0);
1219 }
1220
1221 #[test]
1222 fn test_matrix_operations() {
1223 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
1226 assert_eq!(x, 0.0);
1227 assert_eq!(y, 1.0);
1228
1229 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1231 let (x, y) = transform_point(5.0, 5.0, &scale);
1232 assert_eq!(x, 10.0);
1233 assert_eq!(y, 15.0);
1234
1235 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1237 let (x, y) = transform_point(1.0, 1.0, &complex);
1238 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
1241
1242 #[test]
1243 fn test_text_extractor_new() {
1244 let extractor = TextExtractor::new();
1245 let options = extractor.options;
1246 assert!(!options.preserve_layout);
1247 assert_eq!(options.space_threshold, 0.3);
1248 assert_eq!(options.newline_threshold, 10.0);
1249 assert!(options.sort_by_position);
1250 assert!(!options.detect_columns);
1251 assert_eq!(options.column_threshold, 50.0);
1252 assert!(options.merge_hyphenated);
1253 }
1254
1255 #[test]
1256 fn test_text_extractor_with_options() {
1257 let options = ExtractionOptions {
1258 preserve_layout: true,
1259 space_threshold: 0.3,
1260 newline_threshold: 12.0,
1261 sort_by_position: false,
1262 detect_columns: true,
1263 column_threshold: 60.0,
1264 merge_hyphenated: false,
1265 track_space_decisions: false,
1266 };
1267 let extractor = TextExtractor::with_options(options.clone());
1268 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1269 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1270 assert_eq!(
1271 extractor.options.newline_threshold,
1272 options.newline_threshold
1273 );
1274 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1275 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1276 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1277 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1278 }
1279
1280 #[test]
1285 fn test_calculate_text_width_with_no_font_info() {
1286 let width = calculate_text_width("Hello", 12.0, None);
1288
1289 assert_eq!(
1291 width, 30.0,
1292 "Without font info, should use simplified calculation: len * font_size * 0.5"
1293 );
1294 }
1295
1296 #[test]
1297 fn test_calculate_text_width_with_empty_metrics() {
1298 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1299
1300 let font_info = FontInfo {
1302 name: "TestFont".to_string(),
1303 font_type: "Type1".to_string(),
1304 encoding: None,
1305 to_unicode: None,
1306 differences: None,
1307 descendant_font: None,
1308 cid_to_gid_map: None,
1309 cid_ordering: None,
1310 metrics: FontMetrics {
1311 first_char: None,
1312 last_char: None,
1313 widths: None,
1314 missing_width: Some(500.0),
1315 kerning: None,
1316 },
1317 };
1318
1319 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1320
1321 assert_eq!(
1323 width, 30.0,
1324 "Without widths array, should fall back to simplified calculation"
1325 );
1326 }
1327
1328 #[test]
1329 fn test_calculate_text_width_with_complete_metrics() {
1330 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1331
1332 let mut widths = vec![0.0; 95]; widths[72 - 32] = 722.0; widths[101 - 32] = 556.0; widths[108 - 32] = 278.0; widths[111 - 32] = 611.0; let font_info = FontInfo {
1343 name: "Helvetica".to_string(),
1344 font_type: "Type1".to_string(),
1345 encoding: None,
1346 to_unicode: None,
1347 differences: None,
1348 descendant_font: None,
1349 cid_to_gid_map: None,
1350 cid_ordering: None,
1351 metrics: FontMetrics {
1352 first_char: Some(32),
1353 last_char: Some(126),
1354 widths: Some(widths),
1355 missing_width: Some(500.0),
1356 kerning: None,
1357 },
1358 };
1359
1360 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1361
1362 let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1370 let tolerance = 0.0001; assert!(
1372 (width - expected).abs() < tolerance,
1373 "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1374 expected,
1375 width,
1376 (width - expected).abs()
1377 );
1378
1379 let simplified = 5.0 * 12.0 * 0.5; assert_ne!(
1382 width, simplified,
1383 "Metrics-based calculation should differ from simplified (30.0)"
1384 );
1385 }
1386
/// Verifies that a character whose code lies outside `first_char..=last_char`
/// is measured with `missing_width` instead of an entry from `widths`.
#[test]
fn test_calculate_text_width_character_outside_range() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // The widths array only covers codes 65..=90 ('A'..='Z'), 722 units each.
    let widths = vec![722.0; 26];
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(90),
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    // 'A' (65) is covered by the array; '1' (49) is below `first_char`.
    let width = calculate_text_width("A1", 10.0, Some(&font_info));

    // Expected: 722 units for 'A' plus the 500-unit fallback for '1',
    // each scaled by font_size / 1000.
    let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);

    // Compare with a tolerance rather than `assert_eq!`: the sum of two scaled
    // widths can differ in the last bit depending on whether the
    // implementation scales per glyph or once after summing.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Should use missing_width for characters outside range: expected {}, got {}",
        expected,
        width
    );
}
1425
/// Checks that an explicit `0.0` entry inside the `widths` array is used
/// as-is and is not replaced by `missing_width`.
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // With `first_char` = 32, index 10 belongs to character code 42.
    let mut glyph_widths = vec![500.0; 95];
    glyph_widths[10] = 0.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // A one-character string holding code 42, the zero-width slot set above.
    let sample = char::from(42u8).to_string();
    let measured = calculate_text_width(&sample, 10.0, Some(&font_info));

    assert_eq!(
        measured, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
1464
/// An empty string must measure as zero width, both with font metrics
/// supplied and with no font at all.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    // Metrics-based path.
    let with_font = calculate_text_width("", 12.0, Some(&font_info));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Path taken when no font information is available.
    let without_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        without_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1497
/// Verifies that a non-ASCII character beyond `last_char` is measured with
/// `missing_width`.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // The widths array covers codes 32..=126 only.
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(vec![500.0; 95]),
            missing_width: Some(600.0),
            kerning: None,
        },
    };

    // 'Ñ' is U+00D1 (209), above `last_char`; the test expects the fallback.
    let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // missing_width (600 units) scaled by font_size / 1000.
    let expected = 600.0 / 1000.0 * 10.0;

    // Use a tolerance instead of exact equality so the test does not depend
    // on the order in which the implementation multiplies and divides.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Unicode character outside range should use missing_width: expected {}, got {}",
        expected,
        width
    );
}
1531
/// Verifies that the computed width scales linearly with the font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Single-glyph font: only code 65 ('A') has a width entry (722 units).
    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65),
            last_char: Some(65),
            widths: Some(vec![722.0]),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
    let width_20 = calculate_text_width("A", 20.0, Some(&font_info));

    let expected_10 = 722.0 / 1000.0 * 10.0;
    let expected_20 = 722.0 / 1000.0 * 20.0;

    // Floating-point results are compared with a tolerance so the test is not
    // tied to the exact operation order used by the implementation.
    let tolerance = 0.0001;
    assert!(
        (width_10 - expected_10).abs() < tolerance,
        "Width at size 10: expected {}, got {}",
        expected_10,
        width_10
    );
    assert!(
        (width_20 - expected_20).abs() < tolerance,
        "Width at size 20: expected {}, got {}",
        expected_20,
        width_20
    );
    assert!(
        (width_20 - width_10 * 2.0).abs() < tolerance,
        "Width should scale linearly with font size: size 10 gave {}, size 20 gave {}",
        width_10,
        width_20
    );
}
1567
/// Compares the width of "i" in a proportional font against a monospace
/// font and expects the proportional glyph to be narrower.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts describe codes 105..=107 ('i', 'j', 'k').
    // Proportional: three different widths, 'i' being the narrowest (278).
    let helvetica_metrics = FontMetrics {
        first_char: Some(105),
        last_char: Some(107),
        widths: Some(vec![278.0, 556.0, 722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let proportional_font = FontInfo {
        name: "Helvetica".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: helvetica_metrics,
    };

    // Monospace: every glyph is 600 units wide.
    let courier_metrics = FontMetrics {
        first_char: Some(105),
        last_char: Some(107),
        widths: Some(vec![600.0, 600.0, 600.0]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let monospace_font = FontInfo {
        name: "Courier".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: courier_metrics,
    };

    let narrow = calculate_text_width("i", 12.0, Some(&proportional_font));
    let fixed = calculate_text_width("i", 12.0, Some(&monospace_font));

    assert!(
        narrow < fixed,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        narrow,
        fixed
    );
}
1623
/// Exercises kerning-aware width calculation: pairs present in the kerning
/// map ("AV", "AW") must be adjusted, while the reversed pair "VA", which has
/// no entry, must not.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Widths are indexed by (code - 32): 'A' = 65, 'V' = 86, 'W' = 87.
    let mut glyph_widths = vec![500.0; 95];
    glyph_widths[65 - 32] = 722.0;
    glyph_widths[86 - 32] = 722.0;
    glyph_widths[87 - 32] = 944.0;

    // Kerning adjustments keyed by (left code, right code).
    let mut pair_adjustments = HashMap::new();
    pair_adjustments.insert((65, 86), -50.0);
    pair_adjustments.insert((65, 87), -40.0);

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(500.0),
        kerning: Some(pair_adjustments),
    };
    let font_info = FontInfo {
        name: "Helvetica".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
    };

    let tolerance = 0.0001;

    // "AV": both glyph widths plus the -50 unit pair adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let diff_av = (width_av - expected_av).abs();
    assert!(
        diff_av < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        diff_av
    );

    // "AW": both glyph widths plus the -40 unit pair adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let diff_aw = (width_aw - expected_aw).abs();
    assert!(
        diff_aw < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        diff_aw
    );

    // "VA": the map has no (86, 65) entry, so only the glyph widths count.
    let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let diff_va = (width_va - expected_va).abs();
    assert!(
        diff_va < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        diff_va
    );

    // The kerned pair must end up strictly narrower than the unkerned one.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1709
/// Feeds a hand-built, minimal TrueType byte stream containing a `head` and a
/// `kern` table to `parse_truetype_kern_table` and checks that both kerning
/// pairs are extracted.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Offset table (12 bytes); field names follow the OpenType layout.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // sfnt version
        0x00, 0x02, // number of tables
        0x00, 0x20, // searchRange
        0x00, 0x01, // entrySelector
        0x00, 0x00, // rangeShift
    ];

    // Table record for "head": checksum, offset 0x2C (44), length 0x36 (54).
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', // tag
        0x00, 0x00, 0x00, 0x00, // checksum
        0x00, 0x00, 0x00, 0x2C, // offset
        0x00, 0x00, 0x00, 0x36, // length
    ];

    // Table record for "kern": checksum, offset 0x62 (98), length 0x1E (30).
    let kern_record: [u8; 16] = [
        b'k', b'e', b'r', b'n', // tag
        0x00, 0x00, 0x00, 0x00, // checksum
        0x00, 0x00, 0x00, 0x62, // offset
        0x00, 0x00, 0x00, 0x1E, // length
    ];

    // Zero-filled placeholder for the 54-byte head table.
    let head_table = [0u8; 54];

    // kern table (30 bytes): header, one subtable header, two pairs.
    let kern_table: [u8; 30] = [
        0x00, 0x00, // table version
        0x00, 0x01, // number of subtables
        0x00, 0x00, // subtable version
        0x00, 0x1A, // subtable length (26)
        0x00, 0x00, // coverage
        0x00, 0x02, // number of pairs
        0x00, 0x08, // searchRange
        0x00, 0x00, // entrySelector
        0x00, 0x04, // rangeShift
        0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, // left 65, right 86, value -50
        0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8, // left 65, right 87, value -40
    ];

    // Concatenate the sections in file order: 12 + 16 + 16 + 54 + 30 bytes.
    let sections: [&[u8]; 5] = [
        &offset_table,
        &head_record,
        &kern_record,
        &head_table,
        &kern_table,
    ];
    let ttf_data: Vec<u8> = sections.concat();

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let pairs = result.unwrap();
    assert_eq!(pairs.len(), 2, "Should extract 2 kerning pairs");

    assert_eq!(
        pairs.get(&(65, 86)),
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    assert_eq!(
        pairs.get(&(65, 87)),
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1792
/// A font that has no `kern` table must not be treated as an error:
/// `extract_truetype_kerning` is expected to return an empty map.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // Offset table (12 bytes) declaring a single table.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // sfnt version
        0x00, 0x01, // number of tables
        0x00, 0x10, // searchRange
        0x00, 0x00, // entrySelector
        0x00, 0x00, // rangeShift
    ];

    // The only table record is "head": offset 0x1C (28), length 0x36 (54).
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', // tag
        0x00, 0x00, 0x00, 0x00, // checksum
        0x00, 0x00, 0x00, 0x1C, // offset
        0x00, 0x00, 0x00, 0x36, // length
    ];

    // Zero-filled placeholder for the 54-byte head table.
    let head_table = [0u8; 54];

    let sections: [&[u8]; 3] = [&offset_table, &head_record, &head_table];
    let ttf_data: Vec<u8> = sections.concat();

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let pairs = result.unwrap();
    assert!(
        pairs.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1832}