1use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Settings that steer how page text is extracted and post-processed.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Collect positioned fragments and rebuild the page text from them.
    pub preserve_layout: bool,
    /// Horizontal gap, as a multiple of the font size, above which a space is inserted.
    pub space_threshold: f64,
    /// Vertical distance above which two pieces of text count as separate lines.
    pub newline_threshold: f64,
    /// Sort collected fragments by their position on the page.
    pub sort_by_position: bool,
    /// Run column detection as part of position sorting.
    pub detect_columns: bool,
    /// Horizontal gap on a line above which a column boundary is assumed.
    pub column_threshold: f64,
    /// Join a line ending in `-` with the following line, dropping the hyphen.
    pub merge_hyphenated: bool,
}

impl Default for ExtractionOptions {
    /// Plain-text extraction: no layout preservation, no column detection,
    /// position sorting and hyphen merging enabled.
    fn default() -> Self {
        ExtractionOptions {
            // Feature switches.
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            // Distance thresholds.
            space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
        }
    }
}
48
/// Result of extracting one page: the assembled text plus any positioned fragments.
#[derive(Debug, Clone)]
pub struct ExtractedText {
    /// The page text as a single string.
    pub text: String,
    /// Positioned pieces of text; only filled when `preserve_layout` is enabled.
    pub fragments: Vec<TextFragment>,
}
57
/// A run of text together with where and how it was drawn on the page.
#[derive(Debug, Clone)]
pub struct TextFragment {
    /// Decoded text of the fragment.
    pub text: String,
    /// Horizontal position of the fragment's origin.
    pub x: f64,
    /// Vertical position of the fragment's origin.
    pub y: f64,
    /// Estimated width of the text.
    pub width: f64,
    /// Height of the fragment (the extractor stores the font size here).
    pub height: f64,
    /// Font size in effect when the text was shown.
    pub font_size: f64,
    /// Name of the font resource in effect, if one was set.
    pub font_name: Option<String>,
    /// Whether the font name looks like a bold face.
    pub is_bold: bool,
    /// Whether the font name looks like an italic or oblique face.
    pub is_italic: bool,
    /// Fill colour in effect when the text was shown, if one was set.
    pub color: Option<Color>,
}
82
/// Mutable text and graphics parameters tracked while walking a content stream.
struct TextState {
    /// Current text matrix `[a, b, c, d, e, f]`; advanced as text is shown.
    text_matrix: [f64; 6],
    /// Matrix marking the start of the current line; base for relative moves.
    text_line_matrix: [f64; 6],
    /// Current transformation matrix, accumulated from transform operators.
    ctm: [f64; 6],
    /// Line leading; the next-line operator moves down by this amount.
    leading: f64,
    /// Character spacing as last set by the content stream.
    char_space: f64,
    /// Word spacing as last set by the content stream.
    word_space: f64,
    /// Horizontal scaling in percent (100.0 means unscaled).
    horizontal_scale: f64,
    /// Text rise as last set by the content stream.
    text_rise: f64,
    /// Current font size.
    font_size: f64,
    /// Resource name of the current font, if any has been selected.
    font_name: Option<String>,
    /// Text rendering mode as last set by the content stream.
    render_mode: u8,
    /// Current non-stroking (fill) colour, if any has been set.
    fill_color: Option<Color>,
}
110
111impl Default for TextState {
112 fn default() -> Self {
113 Self {
114 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
115 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
116 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
117 leading: 0.0,
118 char_space: 0.0,
119 word_space: 0.0,
120 horizontal_scale: 100.0,
121 text_rise: 0.0,
122 font_size: 0.0,
123 font_name: None,
124 render_mode: 0,
125 fill_color: None,
126 }
127 }
128}
129
/// Guesses `(is_bold, is_italic)` from a font name.
///
/// The match is case-insensitive. A name counts as bold when it contains
/// `bold`, `-b` or ` b `, or ends with ` b`; it counts as italic when it
/// contains `italic`, `oblique`, `-i` or ` i `, or ends with ` i`.
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let lowered = font_name.to_lowercase();
    let contains_any = |markers: &[&str]| markers.iter().any(|m| lowered.contains(*m));

    let bold = contains_any(&["bold", "-b", " b "]) || lowered.ends_with(" b");
    let italic = contains_any(&["italic", "oblique", "-i", " i "]) || lowered.ends_with(" i");

    (bold, italic)
}
168
/// Extracts text from PDF pages according to a set of [`ExtractionOptions`].
pub struct TextExtractor {
    /// Options controlling spacing, sorting and layout handling.
    options: ExtractionOptions,
    /// Font information keyed by font resource name; rebuilt for each page.
    font_cache: HashMap<String, FontInfo>,
}
175
176impl TextExtractor {
177 pub fn new() -> Self {
179 Self {
180 options: ExtractionOptions::default(),
181 font_cache: HashMap::new(),
182 }
183 }
184
185 pub fn with_options(options: ExtractionOptions) -> Self {
187 Self {
188 options,
189 font_cache: HashMap::new(),
190 }
191 }
192
193 pub fn extract_from_document<R: Read + Seek>(
195 &mut self,
196 document: &PdfDocument<R>,
197 ) -> ParseResult<Vec<ExtractedText>> {
198 let page_count = document.page_count()?;
199 let mut results = Vec::new();
200
201 for i in 0..page_count {
202 let text = self.extract_from_page(document, i)?;
203 results.push(text);
204 }
205
206 Ok(results)
207 }
208
209 pub fn extract_from_page<R: Read + Seek>(
211 &mut self,
212 document: &PdfDocument<R>,
213 page_index: u32,
214 ) -> ParseResult<ExtractedText> {
215 let page = document.get_page(page_index)?;
217
218 self.extract_font_resources(&page, document)?;
220
221 let streams = page.content_streams_with_document(document)?;
223
224 let mut extracted_text = String::new();
225 let mut fragments = Vec::new();
226 let mut state = TextState::default();
227 let mut in_text_object = false;
228 let mut last_x = 0.0;
229 let mut last_y = 0.0;
230
231 for (stream_idx, stream_data) in streams.iter().enumerate() {
233 let operations = match ContentParser::parse_content(stream_data) {
234 Ok(ops) => ops,
235 Err(e) => {
236 eprintln!(
238 "Warning: Failed to parse content stream on page {}, stream {}/{}",
239 page_index + 1,
240 stream_idx + 1,
241 streams.len()
242 );
243 eprintln!(" Error: {}", e);
244 eprintln!(" Stream size: {} bytes", stream_data.len());
245
246 let preview_len = stream_data.len().min(100);
248 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
249 eprintln!(
250 " Stream preview (first {} bytes): {:?}",
251 preview_len,
252 preview.chars().take(80).collect::<String>()
253 );
254
255 continue;
257 }
258 };
259
260 for op in operations {
261 match op {
262 ContentOperation::BeginText => {
263 in_text_object = true;
264 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
266 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
267 }
268
269 ContentOperation::EndText => {
270 in_text_object = false;
271 }
272
273 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
274 state.text_matrix =
275 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
276 state.text_line_matrix =
277 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
278 }
279
280 ContentOperation::MoveText(tx, ty) => {
281 let new_matrix = multiply_matrix(
283 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
284 &state.text_line_matrix,
285 );
286 state.text_matrix = new_matrix;
287 state.text_line_matrix = new_matrix;
288 }
289
290 ContentOperation::NextLine => {
291 let new_matrix = multiply_matrix(
293 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
294 &state.text_line_matrix,
295 );
296 state.text_matrix = new_matrix;
297 state.text_line_matrix = new_matrix;
298 }
299
300 ContentOperation::ShowText(text) => {
301 if in_text_object {
302 let text_bytes = &text;
303 let decoded = self.decode_text(text_bytes, &state)?;
304
305 let combined_matrix = multiply_matrix(&state.ctm, &state.text_matrix);
308 let (x, y) = transform_point(0.0, 0.0, &combined_matrix);
309
310 if !extracted_text.is_empty() {
312 let dx = x - last_x;
313 let dy = (y - last_y).abs();
314
315 if dy > self.options.newline_threshold {
316 extracted_text.push('\n');
317 } else if dx > self.options.space_threshold * state.font_size {
318 extracted_text.push(' ');
319 }
320 }
321
322 extracted_text.push_str(&decoded);
323
324 let font_info = state
326 .font_name
327 .as_ref()
328 .and_then(|name| self.font_cache.get(name));
329
330 if self.options.preserve_layout {
331 let (is_bold, is_italic) = state
333 .font_name
334 .as_ref()
335 .map(|name| parse_font_style(name))
336 .unwrap_or((false, false));
337
338 fragments.push(TextFragment {
339 text: decoded.clone(),
340 x,
341 y,
342 width: calculate_text_width(
343 &decoded,
344 state.font_size,
345 font_info,
346 ),
347 height: state.font_size,
348 font_size: state.font_size,
349 font_name: state.font_name.clone(),
350 is_bold,
351 is_italic,
352 color: state.fill_color,
353 });
354 }
355
356 last_x = x + calculate_text_width(&decoded, state.font_size, font_info);
358 last_y = y;
359
360 let text_width =
362 calculate_text_width(&decoded, state.font_size, font_info);
363 let tx = text_width * state.horizontal_scale / 100.0;
364 state.text_matrix =
365 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
366 }
367 }
368
369 ContentOperation::ShowTextArray(array) => {
370 if in_text_object {
371 let font_info = state
373 .font_name
374 .as_ref()
375 .and_then(|name| self.font_cache.get(name));
376
377 for item in array {
378 match item {
379 TextElement::Text(text_bytes) => {
380 let decoded = self.decode_text(&text_bytes, &state)?;
381 extracted_text.push_str(&decoded);
382
383 let text_width = calculate_text_width(
385 &decoded,
386 state.font_size,
387 font_info,
388 );
389 let tx = text_width * state.horizontal_scale / 100.0;
390 state.text_matrix = multiply_matrix(
391 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
392 &state.text_matrix,
393 );
394 }
395 TextElement::Spacing(adjustment) => {
396 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
398 state.text_matrix = multiply_matrix(
399 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
400 &state.text_matrix,
401 );
402 }
403 }
404 }
405 }
406 }
407
408 ContentOperation::SetFont(name, size) => {
409 state.font_name = Some(name);
410 state.font_size = size as f64;
411 }
412
413 ContentOperation::SetLeading(leading) => {
414 state.leading = leading as f64;
415 }
416
417 ContentOperation::SetCharSpacing(spacing) => {
418 state.char_space = spacing as f64;
419 }
420
421 ContentOperation::SetWordSpacing(spacing) => {
422 state.word_space = spacing as f64;
423 }
424
425 ContentOperation::SetHorizontalScaling(scale) => {
426 state.horizontal_scale = scale as f64;
427 }
428
429 ContentOperation::SetTextRise(rise) => {
430 state.text_rise = rise as f64;
431 }
432
433 ContentOperation::SetTextRenderMode(mode) => {
434 state.render_mode = mode as u8;
435 }
436
437 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
438 let [a0, b0, c0, d0, e0, f0] = state.ctm;
440 let a = a as f64;
441 let b = b as f64;
442 let c = c as f64;
443 let d = d as f64;
444 let e = e as f64;
445 let f = f as f64;
446 state.ctm = [
447 a * a0 + b * c0,
448 a * b0 + b * d0,
449 c * a0 + d * c0,
450 c * b0 + d * d0,
451 e * a0 + f * c0 + e0,
452 e * b0 + f * d0 + f0,
453 ];
454 }
455
456 ContentOperation::SetNonStrokingGray(gray) => {
458 state.fill_color = Some(Color::gray(gray as f64));
459 }
460
461 ContentOperation::SetNonStrokingRGB(r, g, b) => {
462 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
463 }
464
465 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
466 state.fill_color =
467 Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
468 }
469
470 _ => {
471 }
473 }
474 }
475 }
476
477 if self.options.sort_by_position && !fragments.is_empty() {
479 self.sort_and_merge_fragments(&mut fragments);
480 }
481
482 if self.options.preserve_layout && !fragments.is_empty() {
485 fragments = self.merge_close_fragments(&fragments);
486 }
487
488 if self.options.preserve_layout && !fragments.is_empty() {
490 extracted_text = self.reconstruct_text_from_fragments(&fragments);
491 }
492
493 Ok(ExtractedText {
494 text: extracted_text,
495 fragments,
496 })
497 }
498
499 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
501 fragments.sort_by(|a, b| {
503 let y_diff = (b.y - a.y).abs();
505 if y_diff < self.options.newline_threshold {
506 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
508 } else {
509 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
511 }
512 });
513
514 if self.options.detect_columns {
516 self.detect_and_sort_columns(fragments);
517 }
518 }
519
520 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
522 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
524 let mut current_line: Vec<&mut TextFragment> = Vec::new();
525 let mut last_y = f64::INFINITY;
526
527 for fragment in fragments.iter_mut() {
528 let fragment_y = fragment.y;
529 if (last_y - fragment_y).abs() > self.options.newline_threshold
530 && !current_line.is_empty()
531 {
532 lines.push(current_line);
533 current_line = Vec::new();
534 }
535 current_line.push(fragment);
536 last_y = fragment_y;
537 }
538 if !current_line.is_empty() {
539 lines.push(current_line);
540 }
541
542 let mut column_boundaries = vec![0.0];
544 for line in &lines {
545 if line.len() > 1 {
546 for i in 0..line.len() - 1 {
547 let gap = line[i + 1].x - (line[i].x + line[i].width);
548 if gap > self.options.column_threshold {
549 let boundary = line[i].x + line[i].width + gap / 2.0;
550 if !column_boundaries
551 .iter()
552 .any(|&b| (b - boundary).abs() < 10.0)
553 {
554 column_boundaries.push(boundary);
555 }
556 }
557 }
558 }
559 }
560 column_boundaries.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
561
562 if column_boundaries.len() > 1 {
564 fragments.sort_by(|a, b| {
565 let col_a = column_boundaries
567 .iter()
568 .position(|&boundary| a.x < boundary)
569 .unwrap_or(column_boundaries.len())
570 - 1;
571 let col_b = column_boundaries
572 .iter()
573 .position(|&boundary| b.x < boundary)
574 .unwrap_or(column_boundaries.len())
575 - 1;
576
577 if col_a != col_b {
578 col_a.cmp(&col_b)
579 } else {
580 b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal)
582 }
583 });
584 }
585 }
586
587 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
589 let merged_fragments = self.merge_close_fragments(fragments);
591
592 let mut result = String::new();
593 let mut last_y = f64::INFINITY;
594 let mut last_x = 0.0;
595 let mut last_line_ended_with_hyphen = false;
596
597 for fragment in &merged_fragments {
598 let y_diff = (last_y - fragment.y).abs();
600 if !result.is_empty() && y_diff > self.options.newline_threshold {
601 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
603 if result.ends_with('-') {
605 result.pop();
606 }
607 } else {
608 result.push('\n');
609 }
610 } else if !result.is_empty() {
611 let x_gap = fragment.x - last_x;
613 if x_gap > self.options.space_threshold * fragment.font_size {
614 result.push(' ');
615 }
616 }
617
618 result.push_str(&fragment.text);
619 last_line_ended_with_hyphen = fragment.text.ends_with('-');
620 last_y = fragment.y;
621 last_x = fragment.x + fragment.width;
622 }
623
624 result
625 }
626
627 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
630 if fragments.is_empty() {
631 return Vec::new();
632 }
633
634 let mut merged = Vec::new();
635 let mut current = fragments[0].clone();
636
637 for fragment in &fragments[1..] {
638 let y_diff = (current.y - fragment.y).abs();
640 let x_gap = fragment.x - (current.x + current.width);
641
642 let should_merge = y_diff < 1.0 && x_gap >= 0.0 && x_gap < fragment.font_size * 0.5; if should_merge {
649 current.text.push_str(&fragment.text);
651 current.width = (fragment.x + fragment.width) - current.x;
652 } else {
653 merged.push(current);
655 current = fragment.clone();
656 }
657 }
658
659 merged.push(current);
660 merged
661 }
662
663 fn extract_font_resources<R: Read + Seek>(
665 &mut self,
666 page: &ParsedPage,
667 document: &PdfDocument<R>,
668 ) -> ParseResult<()> {
669 self.font_cache.clear();
671
672 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
675 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
676 {
677 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
678 for (font_name, font_obj) in font_dict.0.iter() {
680 if let Some(font_ref) = font_obj.as_reference() {
681 if let Ok(PdfObject::Dictionary(font_dict)) =
682 document.get_object(font_ref.0, font_ref.1)
683 {
684 let mut cmap_extractor: CMapTextExtractor<R> =
686 CMapTextExtractor::new();
687
688 if let Ok(font_info) =
689 cmap_extractor.extract_font_info(&font_dict, document)
690 {
691 let has_to_unicode = font_info.to_unicode.is_some();
692 self.font_cache.insert(font_name.0.clone(), font_info);
693 tracing::debug!(
694 "Cached font: {} (ToUnicode: {})",
695 font_name.0,
696 has_to_unicode
697 );
698 }
699 }
700 }
701 }
702 }
703 }
704 } else if let Some(resources) = page.get_resources() {
705 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
707 for (font_name, font_obj) in font_dict.0.iter() {
708 if let Some(font_ref) = font_obj.as_reference() {
709 if let Ok(PdfObject::Dictionary(font_dict)) =
710 document.get_object(font_ref.0, font_ref.1)
711 {
712 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
713
714 if let Ok(font_info) =
715 cmap_extractor.extract_font_info(&font_dict, document)
716 {
717 let has_to_unicode = font_info.to_unicode.is_some();
718 self.font_cache.insert(font_name.0.clone(), font_info);
719 tracing::debug!(
720 "Cached font: {} (ToUnicode: {})",
721 font_name.0,
722 has_to_unicode
723 );
724 }
725 }
726 }
727 }
728 }
729 }
730
731 Ok(())
732 }
733
734 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
736 use crate::text::encoding::TextEncoding;
737
738 if let Some(ref font_name) = state.font_name {
740 if let Some(font_info) = self.font_cache.get(font_name) {
741 let cmap_extractor: CMapTextExtractor<std::fs::File> = CMapTextExtractor::new();
743
744 if let Ok(decoded) = cmap_extractor.decode_text_with_font(text, font_info) {
746 if !decoded.trim().is_empty()
748 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
749 {
750 tracing::debug!(
751 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
752 font_name,
753 text,
754 decoded
755 );
756 return Ok(decoded);
757 }
758 }
759
760 tracing::debug!(
761 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
762 font_name
763 );
764 }
765 }
766
767 let encoding = if let Some(ref font_name) = state.font_name {
769 match font_name.to_lowercase().as_str() {
770 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
771 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
772 name if name.contains("standard") => TextEncoding::StandardEncoding,
773 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
774 _ => {
775 if font_name.starts_with("Times")
777 || font_name.starts_with("Helvetica")
778 || font_name.starts_with("Courier")
779 {
780 TextEncoding::WinAnsiEncoding } else {
782 TextEncoding::PdfDocEncoding }
784 }
785 }
786 } else {
787 TextEncoding::WinAnsiEncoding };
789
790 let fallback_result = encoding.decode(text);
791 tracing::debug!(
792 "Fallback encoding decoding: {:?} -> \"{}\"",
793 text,
794 fallback_result
795 );
796 Ok(fallback_result)
797 }
798}
799
800impl Default for TextExtractor {
801 fn default() -> Self {
802 Self::new()
803 }
804}
805
/// Multiplies two affine matrices given as `[a, b, c, d, e, f]`.
///
/// Under the convention used by `transform_point`, the result maps a point
/// as if `a` were applied first and `b` second.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;

    [
        // Linear part.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation part.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
817
/// Applies the affine matrix `[a, b, c, d, e, f]` to the point `(x, y)`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
824
825fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
827 if let Some(font) = font_info {
829 if let Some(ref widths) = font.metrics.widths {
830 let first_char = font.metrics.first_char.unwrap_or(0);
831 let last_char = font.metrics.last_char.unwrap_or(255);
832 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
833
834 let mut total_width = 0.0;
835 let chars: Vec<char> = text.chars().collect();
836
837 for (i, &ch) in chars.iter().enumerate() {
838 let char_code = ch as u32;
839
840 let width = if char_code >= first_char && char_code <= last_char {
842 let index = (char_code - first_char) as usize;
843 widths.get(index).copied().unwrap_or(missing_width)
844 } else {
845 missing_width
846 };
847
848 total_width += width / 1000.0 * font_size;
850
851 if let Some(ref kerning) = font.metrics.kerning {
853 if i + 1 < chars.len() {
854 let next_char = chars[i + 1] as u32;
855 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
856 total_width += kern_value / 1000.0 * font_size;
858 }
859 }
860 }
861 }
862
863 return total_width;
864 }
865 }
866
867 text.len() as f64 * font_size * 0.5
869}
870
871#[cfg(test)]
872mod tests {
873 use super::*;
874
    /// Multiplying by the identity on either side leaves a matrix unchanged.
    #[test]
    fn test_matrix_multiplication() {
        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];

        // Identity on the left.
        let result = multiply_matrix(&identity, &translation);
        assert_eq!(result, translation);

        // Identity on the right.
        let result2 = multiply_matrix(&translation, &identity);
        assert_eq!(result2, translation);
    }
886
    /// A pure translation shifts the point by the matrix's offsets.
    #[test]
    fn test_transform_point() {
        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 5.0, &translation);
        assert_eq!(x, 15.0);
        assert_eq!(y, 25.0);
    }
894
    /// Pins every default value of `ExtractionOptions`.
    #[test]
    fn test_extraction_options_default() {
        let options = ExtractionOptions::default();
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }
906
    /// Explicitly constructed options keep the values they were given.
    #[test]
    fn test_extraction_options_custom() {
        // Every field differs from its default.
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.5,
            newline_threshold: 15.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 75.0,
            merge_hyphenated: false,
        };
        assert!(options.preserve_layout);
        assert_eq!(options.space_threshold, 0.5);
        assert_eq!(options.newline_threshold, 15.0);
        assert!(!options.sort_by_position);
        assert!(options.detect_columns);
        assert_eq!(options.column_threshold, 75.0);
        assert!(!options.merge_hyphenated);
    }
926
927 #[test]
928 fn test_parse_font_style_bold() {
929 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
931 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
932
933 assert_eq!(parse_font_style("Arial Bold"), (true, false));
935 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
936
937 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
939 }
940
941 #[test]
942 fn test_parse_font_style_italic() {
943 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
945 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
946
947 assert_eq!(parse_font_style("Arial Italic"), (false, true));
949 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
950
951 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
953 }
954
955 #[test]
956 fn test_parse_font_style_bold_italic() {
957 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
958 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
959 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
960 }
961
962 #[test]
963 fn test_parse_font_style_regular() {
964 assert_eq!(parse_font_style("Helvetica"), (false, false));
965 assert_eq!(parse_font_style("Times-Roman"), (false, false));
966 assert_eq!(parse_font_style("Courier"), (false, false));
967 assert_eq!(parse_font_style("Arial"), (false, false));
968 }
969
970 #[test]
971 fn test_parse_font_style_edge_cases() {
972 assert_eq!(parse_font_style(""), (false, false));
974 assert_eq!(parse_font_style("UnknownFont"), (false, false));
975
976 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
978 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
979 }
980
    /// A `TextFragment` stores the values it is constructed with.
    #[test]
    fn test_text_fragment() {
        let fragment = TextFragment {
            text: "Hello".to_string(),
            x: 100.0,
            y: 200.0,
            width: 50.0,
            height: 12.0,
            font_size: 10.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
        };
        assert_eq!(fragment.text, "Hello");
        assert_eq!(fragment.x, 100.0);
        assert_eq!(fragment.y, 200.0);
        assert_eq!(fragment.width, 50.0);
        assert_eq!(fragment.height, 12.0);
        assert_eq!(fragment.font_size, 10.0);
    }
1002
    /// `ExtractedText` keeps its text and its fragments in the given order.
    #[test]
    fn test_extracted_text() {
        // Two fragments on the same baseline.
        let fragments = vec![
            TextFragment {
                text: "Hello".to_string(),
                x: 100.0,
                y: 200.0,
                width: 50.0,
                height: 12.0,
                font_size: 10.0,
                font_name: None,
                is_bold: false,
                is_italic: false,
                color: None,
            },
            TextFragment {
                text: "World".to_string(),
                x: 160.0,
                y: 200.0,
                width: 50.0,
                height: 12.0,
                font_size: 10.0,
                font_name: None,
                is_bold: false,
                is_italic: false,
                color: None,
            },
        ];

        let extracted = ExtractedText {
            text: "Hello World".to_string(),
            fragments: fragments.clone(),
        };

        assert_eq!(extracted.text, "Hello World");
        assert_eq!(extracted.fragments.len(), 2);
        assert_eq!(extracted.fragments[0].text, "Hello");
        assert_eq!(extracted.fragments[1].text, "World");
    }
1042
    /// Pins the initial text state: identity matrices, 100% scaling, no font.
    #[test]
    fn test_text_state_default() {
        let state = TextState::default();
        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert_eq!(state.leading, 0.0);
        assert_eq!(state.char_space, 0.0);
        assert_eq!(state.word_space, 0.0);
        assert_eq!(state.horizontal_scale, 100.0);
        assert_eq!(state.text_rise, 0.0);
        assert_eq!(state.font_size, 0.0);
        assert!(state.font_name.is_none());
        assert_eq!(state.render_mode, 0);
    }
1058
    /// Checks `transform_point` with rotation, scaling and a combined matrix.
    #[test]
    fn test_matrix_operations() {
        // 90-degree rotation: (1, 0) maps to (0, 1).
        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        let (x, y) = transform_point(1.0, 0.0, &rotation);
        assert_eq!(x, 0.0);
        assert_eq!(y, 1.0);

        // Non-uniform scale: 2x horizontally, 3x vertically.
        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
        let (x, y) = transform_point(5.0, 5.0, &scale);
        assert_eq!(x, 10.0);
        assert_eq!(y, 15.0);

        // Linear part plus translation.
        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
        let (x, y) = transform_point(1.0, 1.0, &complex);
        // x = 2*1 + 1*1 + 10
        assert_eq!(x, 13.0);
        // y = 1*1 + 2*1 + 20
        assert_eq!(y, 23.0);
    }
1079
    /// `TextExtractor::new` uses the default extraction options.
    #[test]
    fn test_text_extractor_new() {
        let extractor = TextExtractor::new();
        let options = extractor.options;
        assert!(!options.preserve_layout);
        assert_eq!(options.space_threshold, 0.2);
        assert_eq!(options.newline_threshold, 10.0);
        assert!(options.sort_by_position);
        assert!(!options.detect_columns);
        assert_eq!(options.column_threshold, 50.0);
        assert!(options.merge_hyphenated);
    }
1092
    /// `TextExtractor::with_options` stores the options it is given.
    #[test]
    fn test_text_extractor_with_options() {
        let options = ExtractionOptions {
            preserve_layout: true,
            space_threshold: 0.3,
            newline_threshold: 12.0,
            sort_by_position: false,
            detect_columns: true,
            column_threshold: 60.0,
            merge_hyphenated: false,
        };
        // Clone so the original stays available for comparison.
        let extractor = TextExtractor::with_options(options.clone());
        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
        assert_eq!(extractor.options.space_threshold, options.space_threshold);
        assert_eq!(
            extractor.options.newline_threshold,
            options.newline_threshold
        );
        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
        assert_eq!(extractor.options.detect_columns, options.detect_columns);
        assert_eq!(extractor.options.column_threshold, options.column_threshold);
        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
    }
1116
    /// Without font info the width is estimated as half the font size per character.
    #[test]
    fn test_calculate_text_width_with_no_font_info() {
        let width = calculate_text_width("Hello", 12.0, None);

        // 5 characters * 12.0 * 0.5
        assert_eq!(
            width, 30.0,
            "Without font info, should use simplified calculation: len * font_size * 0.5"
        );
    }
1132
    /// Font info without a widths table falls back to the simplified estimate.
    #[test]
    fn test_calculate_text_width_with_empty_metrics() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // No widths array; only a missing-width value is present.
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: None,
                last_char: None,
                widths: None,
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        assert_eq!(
            width, 30.0,
            "Without widths array, should fall back to simplified calculation"
        );
    }
1163
    /// With a full widths table the per-character widths are summed.
    #[test]
    fn test_calculate_text_width_with_complete_metrics() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Table covers codes 32..=126; index = code - 32.
        let mut widths = vec![0.0; 95];
        widths[72 - 32] = 722.0; // 'H'
        widths[101 - 32] = 556.0; // 'e'
        widths[108 - 32] = 278.0; // 'l'
        widths[111 - 32] = 611.0; // 'o'
        let font_info = FontInfo {
            name: "Helvetica".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        let width = calculate_text_width("Hello", 12.0, Some(&font_info));

        // H + e + l + l + o, in thousandths of the font size.
        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
        // Per-character accumulation may differ from the closed form by rounding.
        let tolerance = 0.0001;
        assert!(
            (width - expected).abs() < tolerance,
            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
            expected,
            width,
            (width - expected).abs()
        );

        // The simplified estimate would be 30.0.
        let simplified = 5.0 * 12.0 * 0.5;
        assert_ne!(
            width, simplified,
            "Metrics-based calculation should differ from simplified (30.0)"
        );
    }
1221
    /// Characters outside `first_char..=last_char` use `missing_width`.
    #[test]
    fn test_calculate_text_width_character_outside_range() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Widths for 'A'..='Z' only.
        let widths = vec![722.0; 26];
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(65), // 'A'
                last_char: Some(90),  // 'Z'
                widths: Some(widths),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        // 'A' is in range; '1' (code 49) is not.
        let width = calculate_text_width("A1", 10.0, Some(&font_info));

        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
        assert_eq!(
            width, expected,
            "Should use missing_width for characters outside range"
        );
    }
1259
    /// A zero entry in the widths table is used as-is, not replaced by `missing_width`.
    #[test]
    fn test_calculate_text_width_missing_width_in_array() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        let mut widths = vec![500.0; 95];
        // Index 10 corresponds to code 42 (32 + 10).
        widths[10] = 0.0;
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(widths),
                missing_width: Some(600.0),
                kerning: None,
            },
        };

        // Code 42 is '*'.
        let char_code = 42u8 as char;
        let text = char_code.to_string();
        let width = calculate_text_width(&text, 10.0, Some(&font_info));

        assert_eq!(
            width, 0.0,
            "Should use 0.0 width from array, not missing_width"
        );
    }
1297
    /// An empty string has zero width with or without font metrics.
    #[test]
    fn test_calculate_text_width_empty_string() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(500.0),
                kerning: None,
            },
        };

        // Metrics path.
        let width = calculate_text_width("", 12.0, Some(&font_info));
        assert_eq!(width, 0.0, "Empty string should have zero width");

        // Fallback path.
        let width_no_font = calculate_text_width("", 12.0, None);
        assert_eq!(
            width_no_font, 0.0,
            "Empty string should have zero width (no font)"
        );
    }
1329
    /// A character beyond the widths range uses `missing_width`.
    #[test]
    fn test_calculate_text_width_unicode_characters() {
        use crate::text::extraction_cmap::{FontInfo, FontMetrics};

        // Table covers codes 32..=126 only.
        let font_info = FontInfo {
            name: "TestFont".to_string(),
            font_type: "Type1".to_string(),
            encoding: None,
            to_unicode: None,
            differences: None,
            descendant_font: None,
            cid_to_gid_map: None,
            metrics: FontMetrics {
                first_char: Some(32),
                last_char: Some(126),
                widths: Some(vec![500.0; 95]),
                missing_width: Some(600.0),
                kerning: None,
            },
        };

        // 'Ñ' is U+00D1 (209), above last_char.
        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));

        // 600 / 1000 * 10.0
        assert_eq!(
            width, 6.0,
            "Unicode character outside range should use missing_width"
        );
    }
1362
/// The measured width of a glyph must scale linearly with the font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Single-entry width table: only code 65 ('A') is declared, at 722 units.
    let single_glyph_metrics = FontMetrics {
        first_char: Some(65),
        last_char: Some(65),
        widths: Some(vec![722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: single_glyph_metrics,
    };

    let at_size_10 = calculate_text_width("A", 10.0, Some(&font));
    let at_size_20 = calculate_text_width("A", 20.0, Some(&font));

    // Each size individually: glyph width / 1000 * font size.
    assert_eq!(at_size_10, 722.0 / 1000.0 * 10.0);
    assert_eq!(at_size_20, 722.0 / 1000.0 * 20.0);

    // And relative to each other: doubling the size doubles the width.
    assert_eq!(
        at_size_20,
        at_size_10 * 2.0,
        "Width should scale linearly with font size"
    );
}
1397
/// The same character measured with a proportional width table must come out
/// narrower than with a fixed-pitch table when its proportional entry is smaller.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Both fonts declare codes 105..=107 ('i', 'j', 'k').
    // Proportional table: 'i' is the narrowest entry at 278 units.
    let helvetica_like = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(vec![278.0, 556.0, 722.0]),
            missing_width: Some(500.0),
            kerning: None,
        },
    };

    // Fixed-pitch table: every glyph is 600 units wide.
    let courier_like = FontInfo {
        name: String::from("Courier"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(105),
            last_char: Some(107),
            widths: Some(vec![600.0, 600.0, 600.0]),
            missing_width: Some(600.0),
            kerning: None,
        },
    };

    let prop_width = calculate_text_width("i", 12.0, Some(&helvetica_like));
    let mono_width = calculate_text_width("i", 12.0, Some(&courier_like));

    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
1451
/// Kerning adjustments for ordered glyph pairs must be added to the summed glyph
/// widths, and pairs without a kerning entry must be left unadjusted.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Width table for codes 32..=126: 'A' (65) and 'V' (86) are 722 units,
    // 'W' (87) is 944 units, everything else 500 units.
    let widths: Vec<_> = (32..=126)
        .map(|code| match code {
            65 | 86 => 722.0,
            87 => 944.0,
            _ => 500.0,
        })
        .collect();

    // Kerning is keyed by the ordered pair (left code, right code):
    // A+V tightens by 50 units, A+W by 40 units. V+A has no entry.
    let kerning: HashMap<_, _> = [((65, 86), -50.0), ((65, 87), -40.0)]
        .iter()
        .copied()
        .collect();

    let font = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        metrics: FontMetrics {
            first_char: Some(32),
            last_char: Some(126),
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: Some(kerning),
        },
    };

    // "AV": both glyph widths plus the -50 kerning adjustment.
    let width_av = calculate_text_width("AV", 12.0, Some(&font));
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    // Floating-point comparison slack shared by the checks below.
    let tolerance = 0.0001;
    let av_diff = (width_av - expected_av).abs();
    assert!(
        av_diff < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        av_diff
    );

    // "AW": both glyph widths plus the -40 kerning adjustment.
    let width_aw = calculate_text_width("AW", 12.0, Some(&font));
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let aw_diff = (width_aw - expected_aw).abs();
    assert!(
        aw_diff < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        aw_diff
    );

    // "VA": the reversed pair has no kerning entry, so only glyph widths count.
    let width_va = calculate_text_width("VA", 12.0, Some(&font));
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let va_diff = (width_va - expected_va).abs();
    assert!(
        va_diff < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        va_diff
    );

    // Same two glyphs, different order: the kerned pair must be the narrower one.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
1536
/// Parses a hand-built, minimal TrueType file containing a `head` table and a
/// `kern` table with two kerning pairs, and checks both pairs are extracted.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Field names in the comments below follow the OpenType table layouts.

    // Offset table (12 bytes): sfnt version 1.0, numTables = 2,
    // searchRange = 0x20, entrySelector = 1, rangeShift = 0.
    let offset_table: [u8; 12] = [
        0x00, 0x01, 0x00, 0x00, // sfnt version
        0x00, 0x02, // numTables
        0x00, 0x20, // searchRange
        0x00, 0x01, // entrySelector
        0x00, 0x00, // rangeShift
    ];

    // Table record for `head`: checksum 0, offset 0x2C (44), length 0x36 (54).
    let head_record: [u8; 16] = [
        b'h', b'e', b'a', b'd', //
        0x00, 0x00, 0x00, 0x00, //
        0x00, 0x00, 0x00, 0x2C, //
        0x00, 0x00, 0x00, 0x36,
    ];

    // Table record for `kern`: checksum 0, offset 0x62 (98), length 0x1E (30).
    let kern_record: [u8; 16] = [
        b'k', b'e', b'r', b'n', //
        0x00, 0x00, 0x00, 0x00, //
        0x00, 0x00, 0x00, 0x62, //
        0x00, 0x00, 0x00, 0x1E,
    ];

    // `head` table payload: 54 zero bytes (content is irrelevant here).
    let head_table = [0u8; 54];

    // `kern` table payload (30 bytes).
    let kern_table: [u8; 30] = [
        0x00, 0x00, // table version
        0x00, 0x01, // nTables
        0x00, 0x00, // subtable version
        0x00, 0x1A, // subtable length (26)
        0x00, 0x00, // coverage
        0x00, 0x02, // nPairs
        0x00, 0x08, // searchRange
        0x00, 0x00, // entrySelector
        0x00, 0x04, // rangeShift
        0x00, 0x41, 0x00, 0x56, 0xFF, 0xCE, // left 65 ('A'), right 86 ('V'), value -50
        0x00, 0x41, 0x00, 0x57, 0xFF, 0xD8, // left 65 ('A'), right 87 ('W'), value -40
    ];

    let ttf_data: Vec<u8> = [
        &offset_table[..],
        &head_record[..],
        &kern_record[..],
        &head_table[..],
        &kern_table[..],
    ]
    .concat();

    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    let a_then_v = kerning_map.get(&(65, 86));
    assert_eq!(
        a_then_v,
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    let a_then_w = kerning_map.get(&(65, 87));
    assert_eq!(
        a_then_w,
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
1619
/// A TrueType file without a `kern` table must yield an empty kerning map
/// instead of an error.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // Total size: 12-byte offset table + one 16-byte table record + 54 payload bytes.
    let mut ttf_data: Vec<u8> = Vec::with_capacity(82);

    // Offset table: sfnt version 1.0, numTables = 1, searchRange = 0x10,
    // entrySelector = 0, rangeShift = 0.
    ttf_data.extend_from_slice(&[
        0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,
    ]);

    // Sole table record: `head`, checksum 0, offset 0x1C (28), length 0x36 (54).
    ttf_data.extend_from_slice(b"head");
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]);
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1C]);
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]);

    // `head` table payload: 54 zero bytes.
    ttf_data.extend_from_slice(&[0u8; 54]);

    let result = extract_truetype_kerning(&ttf_data);
    assert!(
        result.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let kerning_map = result.unwrap();
    assert!(
        kerning_map.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
1659}