1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Settings that control how page text is extracted and laid out.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// When `true`, positioned fragments are recorded while extracting and the
    /// final page text is rebuilt from those fragments.
    pub preserve_layout: bool,
    /// Horizontal gap, expressed as a multiple of the font size, above which a
    /// space is inserted between two pieces of text.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text are treated as being on
    /// different lines. Also used as the band height when sorting fragments.
    pub newline_threshold: f64,
    /// When `true`, recorded fragments are sorted by position before the final
    /// text is produced.
    pub sort_by_position: bool,
    /// When `true`, position sorting additionally groups fragments by column.
    pub detect_columns: bool,
    /// Minimum horizontal gap between neighbouring fragments on a line for the
    /// gap to count as a column separator.
    pub column_threshold: f64,
    /// When `true`, a trailing hyphen at a line break is removed and the two
    /// lines are joined without a newline.
    pub merge_hyphenated: bool,
    /// NOTE(review): not read by any code visible in this part of the file;
    /// presumably enables filling `TextFragment::space_decisions` — confirm.
    pub track_space_decisions: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction defaults: no layout preservation, position
    /// sorting and hyphen merging enabled, column detection disabled.
    fn default() -> Self {
        ExtractionOptions {
            // Feature switches.
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            track_space_decisions: false,
            // Distance thresholds.
            space_threshold: 0.3,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
52
/// Result of extracting the text of one page.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as a single string.
    pub text: String,
    /// Positioned pieces of text. The extractor only records fragments when
    /// `ExtractionOptions::preserve_layout` is enabled; otherwise this is empty.
    pub fragments: Vec<TextFragment>,
}
61
/// Record of a single "insert a space here?" decision.
///
/// NOTE(review): no code visible in this part of the file constructs this
/// type, so the field descriptions below are inferred from the field names and
/// should be confirmed against the code that fills them.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Presumably the position in the fragment text the decision refers to.
    pub offset: usize,
    /// Presumably the horizontal gap that was measured.
    pub dx: f64,
    /// Presumably the threshold the gap was compared against.
    pub threshold: f64,
    /// Presumably a confidence score for the decision — range not shown here.
    pub confidence: f64,
    /// Presumably whether a space was actually inserted.
    pub inserted: bool,
}
77
/// A piece of text together with its position and font attributes.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the fragment.
    pub text: String,
    /// Horizontal position of the fragment's origin.
    pub x: f64,
    /// Vertical position of the fragment's origin.
    pub y: f64,
    /// Estimated width of the text (see `calculate_text_width`).
    pub width: f64,
    /// Height of the fragment; the extractor stores the font size here.
    pub height: f64,
    /// Font size that was active when the text was shown.
    pub font_size: f64,
    /// Name of the font that was active when the text was shown, if any.
    pub font_name: Option<String>,
    /// Bold flag derived from the font name by `parse_font_style`.
    pub is_bold: bool,
    /// Italic flag derived from the font name by `parse_font_style`.
    pub is_italic: bool,
    /// Fill (non-stroking) colour that was active, if one had been set.
    pub color: Option<Color>,
    /// Space-insertion decisions for this fragment. The extraction code in
    /// this file always stores an empty vector here.
    pub space_decisions: Vec<SpaceDecision>,
}
104
105struct TextState {
107 text_matrix: [f64; 6],
109 text_line_matrix: [f64; 6],
111 ctm: [f64; 6],
113 leading: f64,
115 char_space: f64,
117 word_space: f64,
119 horizontal_scale: f64,
121 text_rise: f64,
123 font_size: f64,
125 font_name: Option<String>,
127 render_mode: u8,
129 fill_color: Option<Color>,
131}
132
133impl Default for TextState {
134 fn default() -> Self {
135 Self {
136 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
137 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
138 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139 leading: 0.0,
140 char_space: 0.0,
141 word_space: 0.0,
142 horizontal_scale: 100.0,
143 text_rise: 0.0,
144 font_size: 0.0,
145 font_name: None,
146 render_mode: 0,
147 fill_color: None,
148 }
149 }
150}
151
/// Guesses whether a font is bold and/or italic from its name.
///
/// The check is case-insensitive. A name is bold when it contains `bold`, and
/// italic when it contains `italic` or `oblique`. The abbreviations `B`, `I`
/// and `BI` are also recognised, but only as a complete word that follows a
/// `-` or a space (for example `Helvetica-B` or `Arial I`). Matching them as
/// whole words avoids false positives such as `Times-Book` or
/// `Arial-Identity-H`, which merely contain `-b` / `-i`.
///
/// Returns `(is_bold, is_italic)`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let name_lower = font_name.to_lowercase();

    let mut is_bold = name_lower.contains("bold");
    let mut is_italic = name_lower.contains("italic") || name_lower.contains("oblique");

    // The first token is the family name; only tokens that come after a
    // separator can be style abbreviations.
    let style_tokens = name_lower
        .split(|c: char| c == '-' || c == ' ')
        .skip(1);

    for token in style_tokens {
        match token {
            "b" => is_bold = true,
            "i" => is_italic = true,
            "bi" => {
                is_bold = true;
                is_italic = true;
            }
            _ => {}
        }
    }

    (is_bold, is_italic)
}
190
/// Extracts text from the pages of a parsed PDF document.
pub struct TextExtractor {
    /// Extraction settings.
    options: ExtractionOptions,
    /// Fonts of the page currently being processed, keyed by the font's name
    /// in the page's font resource dictionary. Cleared at the start of each page.
    font_cache: HashMap<String, FontInfo>,
    /// Parsed fonts keyed by the `(u32, u16)` object reference they were loaded
    /// from, so a font object shared by several pages is parsed only once.
    font_object_cache: HashMap<(u32, u16), FontInfo>,
}
200
201impl TextExtractor {
202 pub fn new() -> Self {
204 Self {
205 options: ExtractionOptions::default(),
206 font_cache: HashMap::new(),
207 font_object_cache: HashMap::new(),
208 }
209 }
210
211 pub fn with_options(options: ExtractionOptions) -> Self {
213 Self {
214 options,
215 font_cache: HashMap::new(),
216 font_object_cache: HashMap::new(),
217 }
218 }
219
220 pub fn extract_from_document<R: Read + Seek>(
222 &mut self,
223 document: &PdfDocument<R>,
224 ) -> ParseResult<Vec<ExtractedText>> {
225 let page_count = document.page_count()?;
226 let mut results = Vec::new();
227
228 for i in 0..page_count {
229 let text = self.extract_from_page(document, i)?;
230 results.push(text);
231 }
232
233 Ok(results)
234 }
235
236 pub fn extract_from_page<R: Read + Seek>(
238 &mut self,
239 document: &PdfDocument<R>,
240 page_index: u32,
241 ) -> ParseResult<ExtractedText> {
242 let page = document.get_page(page_index)?;
244
245 {
247 let _span = tracing::info_span!("font_resources").entered();
248 self.extract_font_resources(&page, document)?;
249 }
250
251 let streams = {
253 let _span = tracing::info_span!("stream_decompress").entered();
254 page.content_streams_with_document(document)?
255 };
256
257 let mut extracted_text = String::new();
258 let mut fragments = Vec::new();
259 let mut state = TextState::default();
260 let mut in_text_object = false;
261 let mut last_x = 0.0;
262 let mut last_y = 0.0;
263
264 for (stream_idx, stream_data) in streams.iter().enumerate() {
266 let operations = match {
267 let _span = tracing::info_span!("content_parse").entered();
268 ContentParser::parse_content(stream_data)
269 } {
270 Ok(ops) => ops,
271 Err(e) => {
272 tracing::debug!(
274 "Warning: Failed to parse content stream on page {}, stream {}/{}",
275 page_index + 1,
276 stream_idx + 1,
277 streams.len()
278 );
279 tracing::debug!(" Error: {}", e);
280 tracing::debug!(" Stream size: {} bytes", stream_data.len());
281
282 let preview_len = stream_data.len().min(100);
284 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
285 tracing::debug!(
286 " Stream preview (first {} bytes): {:?}",
287 preview_len,
288 preview.chars().take(80).collect::<String>()
289 );
290
291 continue;
293 }
294 };
295
296 let _ops_span = tracing::info_span!("text_ops_loop").entered();
297 for op in operations {
298 match op {
299 ContentOperation::BeginText => {
300 in_text_object = true;
301 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
303 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
304 }
305
306 ContentOperation::EndText => {
307 in_text_object = false;
308 }
309
310 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
311 state.text_matrix =
312 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
313 state.text_line_matrix =
314 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
315 }
316
317 ContentOperation::MoveText(tx, ty) => {
318 let new_matrix = multiply_matrix(
320 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
321 &state.text_line_matrix,
322 );
323 state.text_matrix = new_matrix;
324 state.text_line_matrix = new_matrix;
325 }
326
327 ContentOperation::NextLine => {
328 let new_matrix = multiply_matrix(
330 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
331 &state.text_line_matrix,
332 );
333 state.text_matrix = new_matrix;
334 state.text_line_matrix = new_matrix;
335 }
336
337 ContentOperation::ShowText(text) => {
338 if in_text_object {
339 let text_bytes = &text;
340 let decoded = self.decode_text(text_bytes, &state)?;
341
342 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
345 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
346
347 if !extracted_text.is_empty() {
349 let dx = x - last_x;
350 let dy = (y - last_y).abs();
351
352 if dy > self.options.newline_threshold {
353 extracted_text.push('\n');
354 } else if dx > self.options.space_threshold * state.font_size {
355 extracted_text.push(' ');
356 }
357 }
358
359 extracted_text.push_str(&decoded);
360
361 let font_info = state
363 .font_name
364 .as_ref()
365 .and_then(|name| self.font_cache.get(name));
366
367 let text_width =
369 calculate_text_width(&decoded, state.font_size, font_info);
370
371 if self.options.preserve_layout {
372 let (is_bold, is_italic) = state
374 .font_name
375 .as_ref()
376 .map(|name| parse_font_style(name))
377 .unwrap_or((false, false));
378
379 fragments.push(TextFragment {
380 text: decoded.clone(),
381 x,
382 y,
383 width: text_width,
384 height: state.font_size,
385 font_size: state.font_size,
386 font_name: state.font_name.clone(),
387 is_bold,
388 is_italic,
389 color: state.fill_color,
390 space_decisions: Vec::new(),
391 });
392 }
393
394 last_x = x + text_width;
396 last_y = y;
397
398 let tx = text_width * state.horizontal_scale / 100.0;
400 state.text_matrix =
401 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
402 }
403 }
404
405 ContentOperation::ShowTextArray(array) => {
406 if in_text_object {
407 let font_info = state
409 .font_name
410 .as_ref()
411 .and_then(|name| self.font_cache.get(name));
412
413 for item in array {
414 match item {
415 TextElement::Text(text_bytes) => {
416 let decoded = self.decode_text(&text_bytes, &state)?;
417 extracted_text.push_str(&decoded);
418
419 let text_width = calculate_text_width(
421 &decoded,
422 state.font_size,
423 font_info,
424 );
425 let tx = text_width * state.horizontal_scale / 100.0;
426 state.text_matrix = multiply_matrix(
427 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
428 &state.text_matrix,
429 );
430 }
431 TextElement::Spacing(adjustment) => {
432 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
434 state.text_matrix = multiply_matrix(
435 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
436 &state.text_matrix,
437 );
438 }
439 }
440 }
441 }
442 }
443
444 ContentOperation::SetFont(name, size) => {
445 state.font_name = Some(name);
446 state.font_size = size as f64;
447 }
448
449 ContentOperation::SetLeading(leading) => {
450 state.leading = leading as f64;
451 }
452
453 ContentOperation::SetCharSpacing(spacing) => {
454 state.char_space = spacing as f64;
455 }
456
457 ContentOperation::SetWordSpacing(spacing) => {
458 state.word_space = spacing as f64;
459 }
460
461 ContentOperation::SetHorizontalScaling(scale) => {
462 state.horizontal_scale = scale as f64;
463 }
464
465 ContentOperation::SetTextRise(rise) => {
466 state.text_rise = rise as f64;
467 }
468
469 ContentOperation::SetTextRenderMode(mode) => {
470 state.render_mode = mode as u8;
471 }
472
473 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
474 let [a0, b0, c0, d0, e0, f0] = state.ctm;
476 let a = a as f64;
477 let b = b as f64;
478 let c = c as f64;
479 let d = d as f64;
480 let e = e as f64;
481 let f = f as f64;
482 state.ctm = [
483 a * a0 + b * c0,
484 a * b0 + b * d0,
485 c * a0 + d * c0,
486 c * b0 + d * d0,
487 e * a0 + f * c0 + e0,
488 e * b0 + f * d0 + f0,
489 ];
490 }
491
492 ContentOperation::SetNonStrokingGray(gray) => {
494 state.fill_color = Some(Color::gray(gray as f64));
495 }
496
497 ContentOperation::SetNonStrokingRGB(r, g, b) => {
498 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
499 }
500
501 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
502 state.fill_color =
503 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
504 }
505
506 _ => {
507 }
509 }
510 }
511 }
512
513 {
514 let _span = tracing::info_span!("layout_finalize").entered();
515
516 if self.options.sort_by_position && !fragments.is_empty() {
518 self.sort_and_merge_fragments(&mut fragments);
519 }
520
521 if self.options.preserve_layout && !fragments.is_empty() {
524 fragments = self.merge_close_fragments(&fragments);
525 }
526
527 if self.options.preserve_layout && !fragments.is_empty() {
529 extracted_text = self.reconstruct_text_from_fragments(&fragments);
530 }
531 }
532
533 Ok(ExtractedText {
534 text: extracted_text,
535 fragments,
536 })
537 }
538
539 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
541 let threshold = self.options.newline_threshold;
549 fragments.sort_by(|a, b| {
550 let band_a = if threshold > 0.0 {
552 (-a.y / threshold).round()
553 } else {
554 -a.y
555 };
556 let band_b = if threshold > 0.0 {
557 (-b.y / threshold).round()
558 } else {
559 -b.y
560 };
561
562 band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
564 });
565
566 if self.options.detect_columns {
568 self.detect_and_sort_columns(fragments);
569 }
570 }
571
572 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
574 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
576 let mut current_line: Vec<&mut TextFragment> = Vec::new();
577 let mut last_y = f64::INFINITY;
578
579 for fragment in fragments.iter_mut() {
580 let fragment_y = fragment.y;
581 if (last_y - fragment_y).abs() > self.options.newline_threshold
582 && !current_line.is_empty()
583 {
584 lines.push(current_line);
585 current_line = Vec::new();
586 }
587 current_line.push(fragment);
588 last_y = fragment_y;
589 }
590 if !current_line.is_empty() {
591 lines.push(current_line);
592 }
593
594 let mut column_boundaries = vec![0.0];
596 for line in &lines {
597 if line.len() > 1 {
598 for i in 0..line.len() - 1 {
599 let gap = line[i + 1].x - (line[i].x + line[i].width);
600 if gap > self.options.column_threshold {
601 let boundary = line[i].x + line[i].width + gap / 2.0;
602 if !column_boundaries
603 .iter()
604 .any(|&b| (b - boundary).abs() < 10.0)
605 {
606 column_boundaries.push(boundary);
607 }
608 }
609 }
610 }
611 }
612 column_boundaries.sort_by(|a, b| a.total_cmp(b));
613
614 if column_boundaries.len() > 1 {
616 fragments.sort_by(|a, b| {
617 let col_a = column_boundaries
619 .iter()
620 .position(|&boundary| a.x < boundary)
621 .unwrap_or(column_boundaries.len())
622 - 1;
623 let col_b = column_boundaries
624 .iter()
625 .position(|&boundary| b.x < boundary)
626 .unwrap_or(column_boundaries.len())
627 - 1;
628
629 if col_a != col_b {
630 col_a.cmp(&col_b)
631 } else {
632 b.y.total_cmp(&a.y)
634 }
635 });
636 }
637 }
638
639 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
641 let merged_fragments = self.merge_close_fragments(fragments);
643
644 let mut result = String::new();
645 let mut last_y = f64::INFINITY;
646 let mut last_x = 0.0;
647 let mut last_line_ended_with_hyphen = false;
648
649 for fragment in &merged_fragments {
650 let y_diff = (last_y - fragment.y).abs();
652 if !result.is_empty() && y_diff > self.options.newline_threshold {
653 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
655 if result.ends_with('-') {
657 result.pop();
658 }
659 } else {
660 result.push('\n');
661 }
662 } else if !result.is_empty() {
663 let x_gap = fragment.x - last_x;
665 if x_gap > self.options.space_threshold * fragment.font_size {
666 result.push(' ');
667 }
668 }
669
670 result.push_str(&fragment.text);
671 last_line_ended_with_hyphen = fragment.text.ends_with('-');
672 last_y = fragment.y;
673 last_x = fragment.x + fragment.width;
674 }
675
676 result
677 }
678
679 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
682 if fragments.is_empty() {
683 return Vec::new();
684 }
685
686 let mut merged = Vec::new();
687 let mut current = fragments[0].clone();
688
689 for fragment in &fragments[1..] {
690 let y_diff = (current.y - fragment.y).abs();
692 let x_gap = fragment.x - (current.x + current.width);
693
694 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
701 current.text.push_str(&fragment.text);
703 current.width = (fragment.x + fragment.width) - current.x;
704 } else {
705 merged.push(current);
707 current = fragment.clone();
708 }
709 }
710
711 merged.push(current);
712 merged
713 }
714
715 fn extract_font_resources<R: Read + Seek>(
721 &mut self,
722 page: &ParsedPage,
723 document: &PdfDocument<R>,
724 ) -> ParseResult<()> {
725 self.font_cache.clear();
727
728 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
731 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
732 {
733 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
734 for (font_name, font_obj) in font_dict.0.iter() {
735 if let Some(font_ref) = font_obj.as_reference() {
736 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
737 }
738 }
739 }
740 }
741 } else if let Some(resources) = page.get_resources() {
742 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
744 for (font_name, font_obj) in font_dict.0.iter() {
745 if let Some(font_ref) = font_obj.as_reference() {
746 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
747 }
748 }
749 }
750 }
751
752 Ok(())
753 }
754
755 fn cache_font_by_ref<R: Read + Seek>(
757 &mut self,
758 font_name: &str,
759 font_ref: (u32, u16),
760 document: &PdfDocument<R>,
761 ) {
762 if let Some(cached) = self.font_object_cache.get(&font_ref) {
764 self.font_cache
765 .insert(font_name.to_string(), cached.clone());
766 tracing::debug!(
767 "Reused cached font object ({}, {}): {} (ToUnicode: {})",
768 font_ref.0,
769 font_ref.1,
770 font_name,
771 cached.to_unicode.is_some()
772 );
773 return;
774 }
775
776 if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
778 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
779 if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
780 let has_to_unicode = font_info.to_unicode.is_some();
781 self.font_object_cache.insert(font_ref, font_info.clone());
783 self.font_cache.insert(font_name.to_string(), font_info);
785 tracing::debug!(
786 "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
787 font_ref.0,
788 font_ref.1,
789 font_name,
790 has_to_unicode
791 );
792 }
793 }
794 }
795
796 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
798 use crate::text::encoding::TextEncoding;
799
800 if let Some(ref font_name) = state.font_name {
802 if let Some(font_info) = self.font_cache.get(font_name) {
803 if let Ok(decoded) =
805 crate::text::extraction_cmap::decode_text_with_font(text, font_info)
806 {
807 if !decoded.trim().is_empty()
809 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
810 {
811 let sanitized = sanitize_extracted_text(&decoded);
813 tracing::debug!(
814 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
815 font_name,
816 text,
817 sanitized
818 );
819 return Ok(sanitized);
820 }
821 }
822
823 tracing::debug!(
824 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
825 font_name
826 );
827 }
828 }
829
830 let encoding = if let Some(ref font_name) = state.font_name {
832 match font_name.to_lowercase().as_str() {
833 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
834 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
835 name if name.contains("standard") => TextEncoding::StandardEncoding,
836 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
837 _ => {
838 if font_name.starts_with("Times")
840 || font_name.starts_with("Helvetica")
841 || font_name.starts_with("Courier")
842 {
843 TextEncoding::WinAnsiEncoding } else {
845 TextEncoding::PdfDocEncoding }
847 }
848 }
849 } else {
850 TextEncoding::WinAnsiEncoding };
852
853 let fallback_result = encoding.decode(text);
854 let sanitized = sanitize_extracted_text(&fallback_result);
856 tracing::debug!(
857 "Fallback encoding decoding: {:?} -> \"{}\"",
858 text,
859 sanitized
860 );
861 Ok(sanitized)
862 }
863}
864
865impl Default for TextExtractor {
866 fn default() -> Self {
867 Self::new()
868 }
869}
870
/// Multiplies two affine matrices given as `[a, b, c, d, e, f]`.
///
/// The result is `a x b`: transforming a point with it is the same as
/// transforming the point with `a` first and with `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
882
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)` and
/// returns the transformed coordinates.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
889
890fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
892 if let Some(font) = font_info {
894 if let Some(ref widths) = font.metrics.widths {
895 let first_char = font.metrics.first_char.unwrap_or(0);
896 let last_char = font.metrics.last_char.unwrap_or(255);
897 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
898
899 let mut total_width = 0.0;
900 let mut chars = text.chars().peekable();
901
902 while let Some(ch) = chars.next() {
903 let char_code = ch as u32;
904
905 let width = if char_code >= first_char && char_code <= last_char {
907 let index = (char_code - first_char) as usize;
908 widths.get(index).copied().unwrap_or(missing_width)
909 } else {
910 missing_width
911 };
912
913 total_width += width / 1000.0 * font_size;
915
916 if let Some(ref kerning) = font.metrics.kerning {
918 if let Some(&next_ch) = chars.peek() {
919 let next_char = next_ch as u32;
920 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
921 total_width += kern_value / 1000.0 * font_size;
923 }
924 }
925 }
926 }
927
928 return total_width;
929 }
930 }
931
932 text.len() as f64 * font_size * 0.5
934}
935
/// Cleans up decoded text for output.
///
/// * A NUL character, together with an immediately following `U+0003`, becomes
///   a single space.
/// * Runs of spaces (including spaces produced from NUL) collapse to one space.
/// * Tab, line feed and carriage return are kept; a space directly after a tab
///   is dropped.
/// * All other ASCII control characters, including a stand-alone `U+0003`, are
///   removed.
/// * Every other character is copied unchanged.
pub fn sanitize_extracted_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    let mut prev_was_blank = false;
    let mut pending = text.chars().peekable();

    while let Some(current) = pending.next() {
        if current == '\0' || current == ' ' {
            if current == '\0' {
                // Swallow the U+0003 that may directly follow a NUL.
                pending.next_if_eq(&'\u{3}');
            }
            if !prev_was_blank {
                cleaned.push(' ');
                prev_was_blank = true;
            }
        } else if matches!(current, '\t' | '\n' | '\r') {
            cleaned.push(current);
            // Only a tab suppresses a following space; line breaks do not.
            prev_was_blank = current == '\t';
        } else if !current.is_ascii_control() {
            cleaned.push(current);
            prev_was_blank = false;
        }
        // Remaining ASCII control characters are dropped without touching
        // the blank-tracking flag.
    }

    cleaned
}
1028
1029#[cfg(test)]
1030mod tests {
1031 use super::*;
1032
1033 #[test]
1034 fn test_matrix_multiplication() {
1035 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1036 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1037
1038 let result = multiply_matrix(&identity, &translation);
1039 assert_eq!(result, translation);
1040
1041 let result2 = multiply_matrix(&translation, &identity);
1042 assert_eq!(result2, translation);
1043 }
1044
1045 #[test]
1046 fn test_transform_point() {
1047 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
1048 let (x, y) = transform_point(5.0, 5.0, &translation);
1049 assert_eq!(x, 15.0);
1050 assert_eq!(y, 25.0);
1051 }
1052
1053 #[test]
1054 fn test_extraction_options_default() {
1055 let options = ExtractionOptions::default();
1056 assert!(!options.preserve_layout);
1057 assert_eq!(options.space_threshold, 0.3);
1058 assert_eq!(options.newline_threshold, 10.0);
1059 assert!(options.sort_by_position);
1060 assert!(!options.detect_columns);
1061 assert_eq!(options.column_threshold, 50.0);
1062 assert!(options.merge_hyphenated);
1063 }
1064
1065 #[test]
1066 fn test_extraction_options_custom() {
1067 let options = ExtractionOptions {
1068 preserve_layout: true,
1069 space_threshold: 0.5,
1070 newline_threshold: 15.0,
1071 sort_by_position: false,
1072 detect_columns: true,
1073 column_threshold: 75.0,
1074 merge_hyphenated: false,
1075 track_space_decisions: false,
1076 };
1077 assert!(options.preserve_layout);
1078 assert_eq!(options.space_threshold, 0.5);
1079 assert_eq!(options.newline_threshold, 15.0);
1080 assert!(!options.sort_by_position);
1081 assert!(options.detect_columns);
1082 assert_eq!(options.column_threshold, 75.0);
1083 assert!(!options.merge_hyphenated);
1084 }
1085
1086 #[test]
1087 fn test_parse_font_style_bold() {
1088 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
1090 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
1091
1092 assert_eq!(parse_font_style("Arial Bold"), (true, false));
1094 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
1095
1096 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
1098 }
1099
1100 #[test]
1101 fn test_parse_font_style_italic() {
1102 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
1104 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
1105
1106 assert_eq!(parse_font_style("Arial Italic"), (false, true));
1108 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
1109
1110 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
1112 }
1113
1114 #[test]
1115 fn test_parse_font_style_bold_italic() {
1116 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
1117 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
1118 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
1119 }
1120
1121 #[test]
1122 fn test_parse_font_style_regular() {
1123 assert_eq!(parse_font_style("Helvetica"), (false, false));
1124 assert_eq!(parse_font_style("Times-Roman"), (false, false));
1125 assert_eq!(parse_font_style("Courier"), (false, false));
1126 assert_eq!(parse_font_style("Arial"), (false, false));
1127 }
1128
1129 #[test]
1130 fn test_parse_font_style_edge_cases() {
1131 assert_eq!(parse_font_style(""), (false, false));
1133 assert_eq!(parse_font_style("UnknownFont"), (false, false));
1134
1135 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
1137 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
1138 }
1139
1140 #[test]
1141 fn test_text_fragment() {
1142 let fragment = TextFragment {
1143 text: "Hello".to_string(),
1144 x: 100.0,
1145 y: 200.0,
1146 width: 50.0,
1147 height: 12.0,
1148 font_size: 10.0,
1149 font_name: None,
1150 is_bold: false,
1151 is_italic: false,
1152 color: None,
1153 space_decisions: Vec::new(),
1154 };
1155 assert_eq!(fragment.text, "Hello");
1156 assert_eq!(fragment.x, 100.0);
1157 assert_eq!(fragment.y, 200.0);
1158 assert_eq!(fragment.width, 50.0);
1159 assert_eq!(fragment.height, 12.0);
1160 assert_eq!(fragment.font_size, 10.0);
1161 }
1162
1163 #[test]
1164 fn test_extracted_text() {
1165 let fragments = vec![
1166 TextFragment {
1167 text: "Hello".to_string(),
1168 x: 100.0,
1169 y: 200.0,
1170 width: 50.0,
1171 height: 12.0,
1172 font_size: 10.0,
1173 font_name: None,
1174 is_bold: false,
1175 is_italic: false,
1176 color: None,
1177 space_decisions: Vec::new(),
1178 },
1179 TextFragment {
1180 text: "World".to_string(),
1181 x: 160.0,
1182 y: 200.0,
1183 width: 50.0,
1184 height: 12.0,
1185 font_size: 10.0,
1186 font_name: None,
1187 is_bold: false,
1188 is_italic: false,
1189 color: None,
1190 space_decisions: Vec::new(),
1191 },
1192 ];
1193
1194 let extracted = ExtractedText {
1195 text: "Hello World".to_string(),
1196 fragments: fragments,
1197 };
1198
1199 assert_eq!(extracted.text, "Hello World");
1200 assert_eq!(extracted.fragments.len(), 2);
1201 assert_eq!(extracted.fragments[0].text, "Hello");
1202 assert_eq!(extracted.fragments[1].text, "World");
1203 }
1204
1205 #[test]
1206 fn test_text_state_default() {
1207 let state = TextState::default();
1208 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1209 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1210 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
1211 assert_eq!(state.leading, 0.0);
1212 assert_eq!(state.char_space, 0.0);
1213 assert_eq!(state.word_space, 0.0);
1214 assert_eq!(state.horizontal_scale, 100.0);
1215 assert_eq!(state.text_rise, 0.0);
1216 assert_eq!(state.font_size, 0.0);
1217 assert!(state.font_name.is_none());
1218 assert_eq!(state.render_mode, 0);
1219 }
1220
1221 #[test]
1222 fn test_matrix_operations() {
1223 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; let (x, y) = transform_point(1.0, 0.0, &rotation);
1226 assert_eq!(x, 0.0);
1227 assert_eq!(y, 1.0);
1228
1229 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
1231 let (x, y) = transform_point(5.0, 5.0, &scale);
1232 assert_eq!(x, 10.0);
1233 assert_eq!(y, 15.0);
1234
1235 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
1237 let (x, y) = transform_point(1.0, 1.0, &complex);
1238 assert_eq!(x, 13.0); assert_eq!(y, 23.0); }
1241
1242 #[test]
1243 fn test_text_extractor_new() {
1244 let extractor = TextExtractor::new();
1245 let options = extractor.options;
1246 assert!(!options.preserve_layout);
1247 assert_eq!(options.space_threshold, 0.3);
1248 assert_eq!(options.newline_threshold, 10.0);
1249 assert!(options.sort_by_position);
1250 assert!(!options.detect_columns);
1251 assert_eq!(options.column_threshold, 50.0);
1252 assert!(options.merge_hyphenated);
1253 }
1254
1255 #[test]
1256 fn test_text_extractor_with_options() {
1257 let options = ExtractionOptions {
1258 preserve_layout: true,
1259 space_threshold: 0.3,
1260 newline_threshold: 12.0,
1261 sort_by_position: false,
1262 detect_columns: true,
1263 column_threshold: 60.0,
1264 merge_hyphenated: false,
1265 track_space_decisions: false,
1266 };
1267 let extractor = TextExtractor::with_options(options.clone());
1268 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
1269 assert_eq!(extractor.options.space_threshold, options.space_threshold);
1270 assert_eq!(
1271 extractor.options.newline_threshold,
1272 options.newline_threshold
1273 );
1274 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
1275 assert_eq!(extractor.options.detect_columns, options.detect_columns);
1276 assert_eq!(extractor.options.column_threshold, options.column_threshold);
1277 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
1278 }
1279
1280 #[test]
1285 fn test_calculate_text_width_with_no_font_info() {
1286 let width = calculate_text_width("Hello", 12.0, None);
1288
1289 assert_eq!(
1291 width, 30.0,
1292 "Without font info, should use simplified calculation: len * font_size * 0.5"
1293 );
1294 }
1295
1296 #[test]
1297 fn test_calculate_text_width_with_empty_metrics() {
1298 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1299
1300 let font_info = FontInfo {
1302 name: "TestFont".to_string(),
1303 font_type: "Type1".to_string(),
1304 encoding: None,
1305 to_unicode: None,
1306 differences: None,
1307 descendant_font: None,
1308 cid_to_gid_map: None,
1309 metrics: FontMetrics {
1310 first_char: None,
1311 last_char: None,
1312 widths: None,
1313 missing_width: Some(500.0),
1314 kerning: None,
1315 },
1316 };
1317
1318 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1319
1320 assert_eq!(
1322 width, 30.0,
1323 "Without widths array, should fall back to simplified calculation"
1324 );
1325 }
1326
1327 #[test]
1328 fn test_calculate_text_width_with_complete_metrics() {
1329 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1330
1331 let mut widths = vec![0.0; 95]; widths[72 - 32] = 722.0; widths[101 - 32] = 556.0; widths[108 - 32] = 278.0; widths[111 - 32] = 611.0; let font_info = FontInfo {
1342 name: "Helvetica".to_string(),
1343 font_type: "Type1".to_string(),
1344 encoding: None,
1345 to_unicode: None,
1346 differences: None,
1347 descendant_font: None,
1348 cid_to_gid_map: None,
1349 metrics: FontMetrics {
1350 first_char: Some(32),
1351 last_char: Some(126),
1352 widths: Some(widths),
1353 missing_width: Some(500.0),
1354 kerning: None,
1355 },
1356 };
1357
1358 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
1359
1360 let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
1368 let tolerance = 0.0001; assert!(
1370 (width - expected).abs() < tolerance,
1371 "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
1372 expected,
1373 width,
1374 (width - expected).abs()
1375 );
1376
1377 let simplified = 5.0 * 12.0 * 0.5; assert_ne!(
1380 width, simplified,
1381 "Metrics-based calculation should differ from simplified (30.0)"
1382 );
1383 }
1384
1385 #[test]
1386 fn test_calculate_text_width_character_outside_range() {
1387 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
1388
1389 let widths = vec![722.0; 26]; let font_info = FontInfo {
1393 name: "TestFont".to_string(),
1394 font_type: "Type1".to_string(),
1395 encoding: None,
1396 to_unicode: None,
1397 differences: None,
1398 descendant_font: None,
1399 cid_to_gid_map: None,
1400 metrics: FontMetrics {
1401 first_char: Some(65), last_char: Some(90), widths: Some(widths),
1404 missing_width: Some(500.0),
1405 kerning: None,
1406 },
1407 };
1408
1409 let width = calculate_text_width("A1", 10.0, Some(&font_info));
1411
1412 let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
1417 assert_eq!(
1418 width, expected,
1419 "Should use missing_width for characters outside range"
1420 );
1421 }
1422
/// Expects an explicit `0.0` entry in the width table to be honoured instead
/// of being replaced by `missing_width`.
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Code 42 maps to table index 10 because the table starts at code 32.
    let zero_width_code = 42u8;

    // 95 entries for codes 32..=126, all 500 except the single zeroed slot.
    let mut glyph_widths = vec![500.0; 95];
    glyph_widths[usize::from(zero_width_code) - 32] = 0.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    // A one-character string holding only the zero-width glyph.
    let sample = char::from(zero_width_code).to_string();
    let width = calculate_text_width(&sample, 10.0, Some(&font_info));

    assert_eq!(
        width, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
1460
/// Expects an empty string to measure zero, both with font metrics supplied
/// and with no font at all.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Uniform 500-unit table for codes 32..=126.
    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };

    let empty = "";

    // With metrics available.
    let width = calculate_text_width(empty, 12.0, Some(&font_info));
    assert_eq!(width, 0.0, "Empty string should have zero width");

    // Without any font information.
    let width_no_font = calculate_text_width(empty, 12.0, None);
    assert_eq!(
        width_no_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
1492
/// Expects a non-ASCII character that the width table does not cover to be
/// measured with `missing_width`.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // The table covers only codes 32..=126.
    let ascii_only_metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: ascii_only_metrics,
    };

    // 'Ñ' is U+00D1, which lies beyond `last_char` (126).
    let measured = calculate_text_width("Ñ", 10.0, Some(&font_info));

    // 6.0 corresponds to `missing_width` (600) divided by 1000 at size 10.
    assert_eq!(
        measured, 6.0,
        "Unicode character outside range should use missing_width"
    );
}
1525
/// Expects the measured width of the same glyph to scale linearly with the
/// font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // A one-entry table: only code 65 ('A') has a width.
    let single_glyph_metrics = FontMetrics {
        first_char: Some(65),
        last_char: Some(65),
        widths: Some(vec![722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font_info = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: single_glyph_metrics,
    };
    let font = Some(&font_info);

    // Measure the same glyph at two sizes, one double the other.
    let at_size_10 = calculate_text_width("A", 10.0, font);
    let at_size_20 = calculate_text_width("A", 20.0, font);

    // Each result should be the table width divided by 1000 times the size.
    assert_eq!(at_size_10, 722.0 / 1000.0 * 10.0);
    assert_eq!(at_size_20, 722.0 / 1000.0 * 20.0);
    assert_eq!(
        at_size_20,
        at_size_10 * 2.0,
        "Width should scale linearly with font size"
    );
}
1560
/// Expects per-glyph metrics to matter: an 'i' with a narrow table width must
/// measure less than an 'i' in a font where every glyph is 600 units wide.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts cover codes 105..=107 ('i', 'j', 'k') and differ only in
    // name, width table and missing width.
    let build_font = |name: &str, widths, missing_width| FontInfo {
        name: name.to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(widths),
            missing_width: Some(missing_width),
            kerning: None,
        },
    };

    // Varying widths: 'i' is the narrowest entry at 278.
    let proportional_font = build_font("Helvetica", vec![278.0, 556.0, 722.0], 500.0);
    // Fixed pitch: every entry is 600.
    let monospace_font = build_font("Courier", vec![600.0, 600.0, 600.0], 600.0);

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1614
/// Expects kerning adjustments to be added to the summed glyph widths for
/// pairs present in the kerning map, and no adjustment for absent pairs.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Table for codes 32..=126: every glyph is 500 except 'A' (65),
    // 'V' (86) and 'W' (87).
    let mut glyph_widths = vec![500.0; 95];
    for &(code, units) in [(65usize, 722.0), (86, 722.0), (87, 944.0)].iter() {
        glyph_widths[code - 32] = units;
    }

    // Adjustments keyed by (left code, right code). Only A-then-V and
    // A-then-W are listed; the reversed order V-then-A is deliberately absent.
    let mut pair_adjustments = HashMap::new();
    pair_adjustments.insert((65, 86), -50.0);
    pair_adjustments.insert((65, 87), -40.0);

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(glyph_widths),
        missing_width: Some(500.0),
        kerning: Some(pair_adjustments),
    };
    let font_info = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics,
    };
    let font = Some(&font_info);
    let tolerance = 0.0001;

    // "AV": both glyph widths plus the -50 pair adjustment.
    let width_av = calculate_text_width("AV", 12.0, font);
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    assert!(
        (width_av - expected_av).abs() < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        (width_av - expected_av).abs()
    );

    // "AW": both glyph widths plus the -40 pair adjustment.
    let width_aw = calculate_text_width("AW", 12.0, font);
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    assert!(
        (width_aw - expected_aw).abs() < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        (width_aw - expected_aw).abs()
    );

    // "VA": the pair is not in the map, so only the glyph widths count.
    let width_va = calculate_text_width("VA", 12.0, font);
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    assert!(
        (width_va - expected_va).abs() < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        (width_va - expected_va).abs()
    );

    // Same two glyphs, but only the kerned ordering should be tightened.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1699
/// Feeds `parse_truetype_kern_table` a hand-built font buffer that contains a
/// `head` and a `kern` table, and expects exactly two kerning pairs back.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // 12-byte header; the 16-bit field at bytes 4..6 (value 2) matches the
    // number of directory records that follow.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // version
        0x00, 0x02, // table count
        0x00, 0x20, 0x00, 0x01, 0x00, 0x00, // remaining header fields
    ];

    // Directory records: tag, 4 zero bytes, offset, length.
    // "head" sits at offset 0x2C (44 = 12 + 2 * 16) and is 0x36 (54) bytes long.
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x00, 0x00,
        0x36,
    ];
    // "kern" sits at offset 0x62 (98 = 44 + 54) and is 0x1E (30) bytes long.
    let kern_record: [u8; 16] = [
        b'k', b'e', b'r', b'n', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00,
        0x1E,
    ];

    // The head table contents are all zero in this fixture.
    let head_table = [0u8; 54];

    let kern_table: [u8; 30] = [
        0x00, 0x00, 0x00, 0x01, // table header, ends with a count of 1
        0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, // subtable header; 0x1A = 26 bytes
        0x00, 0x02, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, // pair count 2 + search fields
        0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, // 65 ('A'), 86 ('V'), -50
        0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8, // 65 ('A'), 87 ('W'), -40
    ];

    // Concatenate the sections in file order.
    let ttf_data = [
        &offset_table[..],
        &head_record[..],
        &kern_record[..],
        &head_table[..],
        &kern_table[..],
    ]
    .concat();

    let kerning_map = match parse_truetype_kern_table(&ttf_data) {
        Ok(pairs) => pairs,
        Err(error) => panic!(
            "Should parse minimal kern table successfully: {:?}",
            Some(error)
        ),
    };
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    assert_eq!(
        kerning_map.get(&(65, 86)),
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    assert_eq!(
        kerning_map.get(&(65, 87)),
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1782
/// Feeds `extract_truetype_kerning` a font buffer whose only table is `head`
/// and expects a successful, empty result rather than an error.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // Header (12 bytes) followed by one directory record (16 bytes).
    let mut ttf_data = vec![
        0x00, 0x01, 0x00, 0x00, // version
        0x00, 0x01, // table count: 1
        0x00, 0x10, 0x00, 0x00, 0x00, 0x00, // remaining header fields
        b'h', b'e', b'a', b'd', // tag
        0x00, 0x00, 0x00, 0x00, // 4 zero bytes
        0x00, 0x00, 0x00, 0x1C, // offset: 28, right after the record
        0x00, 0x00, 0x00, 0x36, // length: 54
    ];
    // Append the 54 zero bytes that make up the head table contents.
    let head_table_len = 54;
    ttf_data.resize(ttf_data.len() + head_table_len, 0x00);

    let outcome = extract_truetype_kerning(&ttf_data);
    assert!(
        outcome.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let pairs = outcome.unwrap();
    assert!(
        pairs.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1822}