1pub mod format;
2pub mod layout;
3pub mod search;
4pub mod text_layout;
5
6use std::collections::HashMap;
7
8use crate::content::{ContentOp, Operand, parse_content_stream};
9use crate::error::Result;
10use crate::font::{Encoding, FontInfo, ToUnicodeCMap, decode_text, parse_font_info};
11use crate::object::{PdfDict, PdfObject};
12use crate::page::{PageInfo, collect_pages};
13use crate::parser::PdfDocument;
14
/// A 2D affine transform in the six-number form `[a b c d e f]`.
///
/// Points are treated as row vectors, so the matrix stands for
///
/// ```text
/// | a b 0 |
/// | c d 0 |
/// | e f 1 |
/// ```
///
/// and a point maps to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Debug, Clone, Copy)]
pub struct Matrix {
    pub a: f64,
    pub b: f64,
    pub c: f64,
    pub d: f64,
    pub e: f64,
    pub f: f64,
}

impl Matrix {
    /// Returns the transform that leaves every point unchanged.
    pub fn identity() -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: 0.0,
            f: 0.0,
        }
    }

    /// Returns the product `self × other`: the transform that applies `self`
    /// first and `other` second.
    pub fn concat(&self, other: &Matrix) -> Matrix {
        Matrix {
            a: self.a * other.a + self.b * other.c,
            b: self.a * other.b + self.b * other.d,
            c: self.c * other.a + self.d * other.c,
            d: self.c * other.b + self.d * other.d,
            e: self.e * other.a + self.f * other.c + other.e,
            f: self.e * other.b + self.f * other.d + other.f,
        }
    }

    /// Maps the point `(x, y)` through this transform.
    pub fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        (
            self.a * x + self.c * y + self.e,
            self.b * x + self.d * y + self.f,
        )
    }

    /// Returns a pure translation by `(tx, ty)`.
    pub fn translate(tx: f64, ty: f64) -> Self {
        Self {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: tx,
            f: ty,
        }
    }

    /// Returns the vertical scale factor of this transform, used as the
    /// effective font size of text rendered through it.
    ///
    /// The unit vector along the y axis maps to `(c, d)` (see
    /// [`Matrix::transform_point`]), so its length is the amount by which
    /// heights are stretched. Using `(b, d)` instead would report the
    /// horizontal scale for rotated, non-uniformly scaled text.
    pub fn font_size_scale(&self) -> f64 {
        self.c.hypot(self.d)
    }
}
78
79#[derive(Debug, Clone)]
84struct ResolvedFont {
85 info: FontInfo,
86 cmap: Option<ToUnicodeCMap>,
87}
88
89impl ResolvedFont {
90 fn char_width(&self, code: u32) -> f64 {
92 self.info.widths.get_width(code)
93 }
94}
95
/// Text-state parameters tracked while a content stream is interpreted.
///
/// Each field is written by the operator(s) noted on it.
#[derive(Clone, Debug)]
struct TextState {
    /// Character spacing (`Tc`, and the second operand of `"`).
    char_spacing: f64,
    /// Word spacing (`Tw`, and the first operand of `"`).
    word_spacing: f64,
    /// Horizontal scaling as a fraction: the `Tz` operand divided by 100.
    horiz_scaling: f64,
    /// Leading (`TL`; `TD` sets it to the negated vertical offset).
    leading: f64,
    /// Resource name of the selected font (`Tf`).
    font_name: Vec<u8>,
    /// Font size (`Tf`).
    font_size: f64,
    /// Text rise (`Ts`).
    text_rise: f64,
}

impl Default for TextState {
    /// No extra spacing, leading or rise; 100% horizontal scaling; no font
    /// selected; font size 12.
    fn default() -> Self {
        let font_name = Vec::new();
        Self {
            font_name,
            font_size: 12.0,
            horiz_scaling: 1.0,
            char_spacing: 0.0,
            word_spacing: 0.0,
            leading: 0.0,
            text_rise: 0.0,
        }
    }
}
131
132#[derive(Debug, Clone)]
137struct GraphicsState {
138 ctm: Matrix,
139 text: TextState,
140}
141
142impl Default for GraphicsState {
143 fn default() -> Self {
144 Self {
145 ctm: Matrix::identity(),
146 text: TextState::default(),
147 }
148 }
149}
150
/// A single decoded glyph together with where it was placed on the page.
#[derive(Clone, Debug)]
pub struct TextChar {
    /// Decoded text for the glyph (may be more than one Unicode scalar).
    pub unicode: String,
    /// Horizontal position of the glyph origin after all transforms.
    pub x: f64,
    /// Vertical position of the glyph origin after all transforms.
    pub y: f64,
    /// Effective font size, taken from the scale of the text rendering matrix.
    pub font_size: f64,
    /// Resource name of the font the glyph was shown with (lossily decoded as UTF-8).
    pub font_name: String,
    /// Horizontal extent of the glyph; stored as a non-negative value.
    pub width: f64,
}
171
/// A run of glyphs grouped into one word.
#[derive(Clone, Debug)]
pub struct TextWord {
    /// The word's text with surrounding whitespace trimmed.
    pub text: String,
    /// Horizontal position where the word starts.
    pub x: f64,
    /// Vertical position of the word.
    pub y: f64,
    /// Distance from `x` to the end of the word's last glyph.
    pub width: f64,
    /// Font size recorded for the word.
    pub font_size: f64,
}
181
182#[derive(Debug, Clone)]
184pub struct TextLine {
185 pub text: String,
186 pub words: Vec<TextWord>,
187 pub x: f64,
188 pub y: f64,
189}
190
191#[derive(Debug, Clone)]
193pub struct TextBlock {
194 pub text: String,
195 pub lines: Vec<TextLine>,
196}
197
198#[derive(Debug, Clone)]
200pub struct PageText {
201 pub page_index: usize,
202 pub chars: Vec<TextChar>,
203 pub lines: Vec<TextLine>,
204 pub blocks: Vec<TextBlock>,
205}
206
207impl PageText {
208 pub fn plain_text(&self) -> String {
210 self.blocks
211 .iter()
212 .map(|b| b.text.as_str())
213 .collect::<Vec<_>>()
214 .join("\n\n")
215 }
216}
217
218struct TextInterpreter {
223 gs_stack: Vec<GraphicsState>,
225 gs: GraphicsState,
227 tm: Matrix,
229 tlm: Matrix,
231 in_text: bool,
233 fonts: HashMap<Vec<u8>, ResolvedFont>,
235 chars: Vec<TextChar>,
237}
238
239impl TextInterpreter {
240 fn new(fonts: HashMap<Vec<u8>, ResolvedFont>) -> Self {
241 Self {
242 gs_stack: Vec::new(),
243 gs: GraphicsState::default(),
244 tm: Matrix::identity(),
245 tlm: Matrix::identity(),
246 in_text: false,
247 fonts,
248 chars: Vec::new(),
249 }
250 }
251
252 fn run(mut self, ops: &[ContentOp]) -> Vec<TextChar> {
253 for op in ops {
254 self.process_op(op);
255 }
256 self.chars
257 }
258
259 fn process_op(&mut self, op: &ContentOp) {
260 match op.operator.as_slice() {
261 b"q" => self.gs_stack.push(self.gs.clone()),
263 b"Q" => {
264 if let Some(gs) = self.gs_stack.pop() {
265 self.gs = gs;
266 }
267 }
268 b"cm" => {
269 if let Some(m) = self.read_matrix(&op.operands) {
270 self.gs.ctm = m.concat(&self.gs.ctm);
271 }
272 }
273
274 b"Tc" => {
276 if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
277 self.gs.text.char_spacing = v;
278 }
279 }
280 b"Tw" => {
281 if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
282 self.gs.text.word_spacing = v;
283 }
284 }
285 b"Tz" => {
286 if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
287 self.gs.text.horiz_scaling = v / 100.0;
288 }
289 }
290 b"TL" => {
291 if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
292 self.gs.text.leading = v;
293 }
294 }
295 b"Tf" => {
296 if op.operands.len() >= 2 {
297 if let Some(name) = op.operands[0].as_name() {
298 self.gs.text.font_name = name.to_vec();
299 }
300 if let Some(size) = op.operands[1].as_f64() {
301 self.gs.text.font_size = size;
302 }
303 }
304 }
305 b"Ts" => {
306 if let Some(v) = op.operands.first().and_then(|o| o.as_f64()) {
307 self.gs.text.text_rise = v;
308 }
309 }
310
311 b"BT" => {
313 self.in_text = true;
314 self.tm = Matrix::identity();
315 self.tlm = Matrix::identity();
316 }
317 b"ET" => {
318 self.in_text = false;
319 }
320
321 b"Td" => {
323 if op.operands.len() >= 2 {
324 let tx = op.operands[0].as_f64().unwrap_or(0.0);
325 let ty = op.operands[1].as_f64().unwrap_or(0.0);
326 self.tlm = Matrix::translate(tx, ty).concat(&self.tlm);
327 self.tm = self.tlm;
328 }
329 }
330 b"TD" => {
331 if op.operands.len() >= 2 {
332 let tx = op.operands[0].as_f64().unwrap_or(0.0);
333 let ty = op.operands[1].as_f64().unwrap_or(0.0);
334 self.gs.text.leading = -ty;
335 self.tlm = Matrix::translate(tx, ty).concat(&self.tlm);
336 self.tm = self.tlm;
337 }
338 }
339 b"Tm" => {
340 if let Some(m) = self.read_matrix(&op.operands) {
341 self.tm = m;
342 self.tlm = m;
343 }
344 }
345 b"T*" => {
346 let tl = self.gs.text.leading;
347 self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
348 self.tm = self.tlm;
349 }
350
351 b"Tj" => {
353 if let Some(s) = op.operands.first().and_then(|o| o.as_str()) {
354 self.show_string(s);
355 }
356 }
357 b"TJ" => {
358 if let Some(arr) = op.operands.first().and_then(|o| o.as_array()) {
359 self.show_tj_array(arr);
360 }
361 }
362 b"'" => {
363 let tl = self.gs.text.leading;
365 self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
366 self.tm = self.tlm;
367 if let Some(s) = op.operands.first().and_then(|o| o.as_str()) {
368 self.show_string(s);
369 }
370 }
371 b"\"" => {
372 if op.operands.len() >= 3 {
374 if let Some(aw) = op.operands[0].as_f64() {
375 self.gs.text.word_spacing = aw;
376 }
377 if let Some(ac) = op.operands[1].as_f64() {
378 self.gs.text.char_spacing = ac;
379 }
380 let tl = self.gs.text.leading;
381 self.tlm = Matrix::translate(0.0, -tl).concat(&self.tlm);
382 self.tm = self.tlm;
383 if let Some(s) = op.operands[2].as_str() {
384 self.show_string(s);
385 }
386 }
387 }
388
389 b"gs" => {
391 }
393
394 _ => {} }
396 }
397
398 fn show_string(&mut self, raw: &[u8]) {
400 let font = self.fonts.get(&self.gs.text.font_name);
401
402 let is_two_byte = font
403 .map(|f| {
404 matches!(f.info.encoding, Encoding::Identity) || f.info.subtype == b"Type0"
405 })
406 .unwrap_or(false);
407
408 let tfs = self.gs.text.font_size;
409 let tc = self.gs.text.char_spacing;
410 let tw = self.gs.text.word_spacing;
411 let th = self.gs.text.horiz_scaling;
412 let rise = self.gs.text.text_rise;
413
414 let mut i = 0;
416 while i < raw.len() {
417 let (code, byte_len) = if is_two_byte && i + 1 < raw.len() {
418 (((raw[i] as u32) << 8) | raw[i + 1] as u32, 2)
419 } else {
420 (raw[i] as u32, 1)
421 };
422
423 let unicode = if let Some(f) = font {
425 if let Some(ref cmap) = f.cmap {
426 cmap.lookup(code)
427 .unwrap_or_else(|| decode_text(&raw[i..i + byte_len], f.info.encoding))
428 } else {
429 decode_text(&raw[i..i + byte_len], f.info.encoding)
430 }
431 } else {
432 String::from_utf8_lossy(&raw[i..i + byte_len]).into_owned()
433 };
434
435 let w0 = font
437 .map(|f| f.char_width(code))
438 .unwrap_or(500.0);
439
440 let trm = self.text_rendering_matrix(tfs, th, rise);
443 let (x, y) = trm.transform_point(0.0, 0.0);
444 let effective_size = trm.font_size_scale();
445
446 let tx = w0 / 1000.0 * tfs;
448 let advance = (tx + tc) * th;
449
450 let total_advance = if code == 32 {
452 advance + tw * th
453 } else {
454 advance
455 };
456
457 let width = (w0 / 1000.0 * tfs * th).abs();
459
460 if !unicode.is_empty() {
462 self.chars.push(TextChar {
463 unicode,
464 x,
465 y,
466 font_size: effective_size,
467 font_name: String::from_utf8_lossy(&self.gs.text.font_name).into_owned(),
468 width,
469 });
470 }
471
472 self.tm = Matrix::translate(total_advance, 0.0).concat(&self.tm);
474
475 i += byte_len;
476 }
477 }
478
479 fn show_tj_array(&mut self, items: &[Operand]) {
481 let th = self.gs.text.horiz_scaling;
482 let tfs = self.gs.text.font_size;
483
484 for item in items {
485 match item {
486 Operand::String(s) => {
487 self.show_string(s);
488 }
489 Operand::Integer(n) => {
490 let displacement = -*n as f64 / 1000.0 * tfs * th;
492 self.tm = Matrix::translate(displacement, 0.0).concat(&self.tm);
493 }
494 Operand::Real(n) => {
495 let displacement = -n / 1000.0 * tfs * th;
496 self.tm = Matrix::translate(displacement, 0.0).concat(&self.tm);
497 }
498 _ => {}
499 }
500 }
501 }
502
503 fn text_rendering_matrix(&self, tfs: f64, th: f64, rise: f64) -> Matrix {
505 let text_state = Matrix {
506 a: tfs * th,
507 b: 0.0,
508 c: 0.0,
509 d: tfs,
510 e: 0.0,
511 f: rise,
512 };
513 text_state.concat(&self.tm).concat(&self.gs.ctm)
514 }
515
516 fn read_matrix(&self, operands: &[Operand]) -> Option<Matrix> {
517 if operands.len() < 6 {
518 return None;
519 }
520 Some(Matrix {
521 a: operands[0].as_f64()?,
522 b: operands[1].as_f64()?,
523 c: operands[2].as_f64()?,
524 d: operands[3].as_f64()?,
525 e: operands[4].as_f64()?,
526 f: operands[5].as_f64()?,
527 })
528 }
529}
530
531fn group_into_words(chars: &[TextChar]) -> Vec<TextWord> {
537 if chars.is_empty() {
538 return Vec::new();
539 }
540
541 let mut words: Vec<TextWord> = Vec::new();
542 let mut current_text = String::new();
543 let mut word_x = chars[0].x;
544 let mut word_y = chars[0].y;
545 let mut word_end_x = chars[0].x;
546 let mut word_font_size = chars[0].font_size;
547
548 for (i, ch) in chars.iter().enumerate() {
549 if i > 0 {
550 let prev = &chars[i - 1];
551 let expected_x = prev.x + prev.width;
552 let gap = (ch.x - expected_x).abs();
553 let y_diff = (ch.y - prev.y).abs();
554 let threshold = prev.font_size * 0.3;
555
556 if gap > threshold || y_diff > prev.font_size * 0.5 {
558 if !current_text.is_empty() {
559 words.push(TextWord {
560 text: current_text.trim().to_string(),
561 x: word_x,
562 y: word_y,
563 width: word_end_x - word_x,
564 font_size: word_font_size,
565 });
566 }
567 current_text = String::new();
568 word_x = ch.x;
569 word_y = ch.y;
570 word_font_size = ch.font_size;
571 }
572 }
573
574 if ch.unicode == " " {
576 if !current_text.is_empty() {
577 words.push(TextWord {
578 text: current_text.trim().to_string(),
579 x: word_x,
580 y: word_y,
581 width: word_end_x - word_x,
582 font_size: word_font_size,
583 });
584 current_text = String::new();
585 word_x = ch.x + ch.width;
586 word_y = ch.y;
587 word_font_size = ch.font_size;
588 }
589 } else {
590 current_text.push_str(&ch.unicode);
591 word_end_x = ch.x + ch.width;
592 }
593 }
594
595 if !current_text.is_empty() {
597 words.push(TextWord {
598 text: current_text.trim().to_string(),
599 x: word_x,
600 y: word_y,
601 width: word_end_x - word_x,
602 font_size: word_font_size,
603 });
604 }
605
606 words.retain(|w| !w.text.is_empty());
608 words
609}
610
611fn group_into_lines(words: &[TextWord]) -> Vec<TextLine> {
613 if words.is_empty() {
614 return Vec::new();
615 }
616
617 let mut sorted: Vec<&TextWord> = words.iter().collect();
619 sorted.sort_by(|a, b| {
620 let y_cmp = b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal);
621 if y_cmp == std::cmp::Ordering::Equal {
622 a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal)
623 } else {
624 y_cmp
625 }
626 });
627
628 let mut lines: Vec<TextLine> = Vec::new();
629 let mut current_line_words: Vec<TextWord> = Vec::new();
630 let mut line_y = sorted[0].y;
631
632 for word in sorted {
633 let y_threshold = word.font_size * 0.5;
634 if (word.y - line_y).abs() > y_threshold && !current_line_words.is_empty() {
635 lines.push(build_line(std::mem::take(&mut current_line_words)));
637 line_y = word.y;
638 }
639 current_line_words.push(word.clone());
640 if current_line_words.len() == 1 {
641 line_y = word.y;
642 }
643 }
644
645 if !current_line_words.is_empty() {
646 lines.push(build_line(current_line_words));
647 }
648
649 lines
650}
651
652fn build_line(mut words: Vec<TextWord>) -> TextLine {
653 words.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal));
655
656 let text = words
657 .iter()
658 .map(|w| w.text.as_str())
659 .collect::<Vec<_>>()
660 .join(" ");
661
662 let x = words.first().map(|w| w.x).unwrap_or(0.0);
663 let y = words.first().map(|w| w.y).unwrap_or(0.0);
664
665 TextLine {
666 text,
667 x,
668 y,
669 words,
670 }
671}
672
673fn resolve_fonts(
679 doc: &PdfDocument,
680 resources_ref: &Option<PdfObject>,
681) -> HashMap<Vec<u8>, ResolvedFont> {
682 let mut fonts = HashMap::new();
683
684 let resources = match resources_ref {
685 Some(PdfObject::Dict(d)) => d.clone(),
686 Some(PdfObject::Reference(r)) => {
687 let r = r.clone();
688 match doc.resolve(&r) {
689 Ok(PdfObject::Dict(d)) => d,
690 _ => return fonts,
691 }
692 }
693 _ => return fonts,
694 };
695
696 let font_dict = match resources.get(b"Font") {
697 Some(PdfObject::Dict(d)) => d.clone(),
698 Some(PdfObject::Reference(r)) => {
699 let r = r.clone();
700 match doc.resolve(&r) {
701 Ok(PdfObject::Dict(d)) => d,
702 _ => return fonts,
703 }
704 }
705 _ => return fonts,
706 };
707
708 for (name, obj) in font_dict.iter() {
709 let font_dict = match obj {
710 PdfObject::Dict(d) => d.clone(),
711 PdfObject::Reference(r) => {
712 let r = r.clone();
713 match doc.resolve(&r) {
714 Ok(PdfObject::Dict(d)) => d.clone(),
715 _ => continue,
716 }
717 }
718 _ => continue,
719 };
720
721 let mut info = parse_font_info(&font_dict);
722
723 let cmap = resolve_to_unicode(doc, &font_dict);
725 if cmap.is_some() {
726 info.to_unicode = None; }
728
729 if font_dict.get_name(b"Subtype") == Some(b"Type0") {
731 resolve_type0_descendant(doc, &font_dict, &mut info);
732 }
733
734 fonts.insert(name.clone(), ResolvedFont { info, cmap });
735 }
736
737 fonts
738}
739
740fn resolve_to_unicode(doc: &PdfDocument, font_dict: &PdfDict) -> Option<ToUnicodeCMap> {
741 let tu_obj = font_dict.get(b"ToUnicode")?;
742 match tu_obj {
743 PdfObject::Reference(r) => {
744 let r = r.clone();
745 let obj = doc.resolve(&r).ok()?;
746 match obj {
747 PdfObject::Stream { dict, data } => {
748 let decoded = doc.decode_stream(&dict, &data).ok()?;
749 Some(ToUnicodeCMap::parse(&decoded))
750 }
751 _ => None,
752 }
753 }
754 PdfObject::Stream { dict, data } => {
755 let decoded = doc.decode_stream(dict, data).ok()?;
756 Some(ToUnicodeCMap::parse(&decoded))
757 }
758 _ => None,
759 }
760}
761
762fn resolve_type0_descendant(doc: &PdfDocument, font_dict: &PdfDict, info: &mut FontInfo) {
763 let descendants = match font_dict.get(b"DescendantFonts") {
764 Some(PdfObject::Array(arr)) => arr.clone(),
765 _ => return,
766 };
767
768 let descendant_ref = match descendants.first() {
769 Some(PdfObject::Reference(r)) => r.clone(),
770 _ => return,
771 };
772
773 let descendant = match doc.resolve(&descendant_ref) {
774 Ok(PdfObject::Dict(d)) => d,
775 _ => return,
776 };
777
778 if let Some(PdfObject::Array(w_array)) = descendant.get(b"W") {
780 info.widths = parse_cid_widths(w_array);
781 }
782
783 if let Some(dw) = descendant.get(b"DW").and_then(|o| o.as_f64()) {
785 match &mut info.widths {
786 crate::font::FontWidths::CID {
787 default_width,
788 ..
789 } => *default_width = dw,
790 crate::font::FontWidths::None {
791 default_width,
792 } => *default_width = dw,
793 _ => {}
794 }
795 }
796
797 info.encoding = Encoding::Identity;
799}
800
801fn parse_cid_widths(w_array: &[PdfObject]) -> crate::font::FontWidths {
802 use crate::font::{CIDWidthEntry, FontWidths};
803
804 let mut entries = Vec::new();
805 let mut i = 0;
806
807 while i < w_array.len() {
808 let first = match w_array[i].as_i64() {
809 Some(v) => v as u32,
810 None => {
811 i += 1;
812 continue;
813 }
814 };
815 i += 1;
816
817 if i >= w_array.len() {
818 break;
819 }
820
821 match &w_array[i] {
822 PdfObject::Array(widths) => {
823 let ws: Vec<f64> = widths.iter().filter_map(|o| o.as_f64()).collect();
824 entries.push(CIDWidthEntry::List {
825 first,
826 widths: ws,
827 });
828 i += 1;
829 }
830 PdfObject::Integer(_) | PdfObject::Real(_) => {
831 if i + 1 < w_array.len() {
832 let last = w_array[i].as_f64().unwrap_or(0.0) as u32;
833 i += 1;
834 let width = w_array.get(i).and_then(|o| o.as_f64()).unwrap_or(1000.0);
835 i += 1;
836 entries.push(CIDWidthEntry::Range {
837 first,
838 last,
839 width,
840 });
841 } else {
842 break;
843 }
844 }
845 _ => {
846 i += 1;
847 }
848 }
849 }
850
851 FontWidths::CID {
852 default_width: 1000.0,
853 w_entries: entries,
854 }
855}
856
857fn get_page_content_data(doc: &PdfDocument, page: &PageInfo) -> Result<Vec<u8>> {
862 let contents_obj = match &page.contents_ref {
863 Some(obj) => obj.clone(),
864 None => return Ok(Vec::new()),
865 };
866
867 match contents_obj {
868 PdfObject::Reference(r) => {
869 let obj = doc.resolve(&r)?;
870 decode_content_obj(doc, &obj)
871 }
872 PdfObject::Array(arr) => {
873 let mut combined = Vec::new();
874 for item in &arr {
875 let data = match item {
876 PdfObject::Reference(r) => {
877 let r = r.clone();
878 let obj = doc.resolve(&r)?;
879 decode_content_obj(doc, &obj)?
880 }
881 _ => Vec::new(),
882 };
883 if !combined.is_empty() {
884 combined.push(b' ');
885 }
886 combined.extend_from_slice(&data);
887 }
888 Ok(combined)
889 }
890 PdfObject::Stream { dict, data } => doc.decode_stream(&dict, &data),
891 _ => Ok(Vec::new()),
892 }
893}
894
895fn decode_content_obj(doc: &PdfDocument, obj: &PdfObject) -> Result<Vec<u8>> {
896 match obj {
897 PdfObject::Stream { dict, data } => doc.decode_stream(dict, data),
898 _ => Ok(Vec::new()),
899 }
900}
901
902pub fn extract_page_text(doc: &PdfDocument, page: &PageInfo) -> Result<PageText> {
908 let fonts = resolve_fonts(doc, &page.resources_ref);
910
911 let content_data = get_page_content_data(doc, page)?;
913
914 if content_data.is_empty() {
915 return Ok(PageText {
916 page_index: page.index,
917 chars: Vec::new(),
918 lines: Vec::new(),
919 blocks: Vec::new(),
920 });
921 }
922
923 let ops = parse_content_stream(&content_data)?;
925
926 let interpreter = TextInterpreter::new(fonts);
928 let chars = interpreter.run(&ops);
929
930 let words = group_into_words(&chars);
932 let lines = group_into_lines(&words);
933 let blocks = layout::detect_columns_and_reorder(&lines);
935
936 Ok(PageText {
937 page_index: page.index,
938 chars,
939 lines,
940 blocks,
941 })
942}
943
944pub fn extract_all_text(doc: &PdfDocument) -> Result<Vec<PageText>> {
946 let pages = collect_pages(doc)?;
947 let mut results = Vec::with_capacity(pages.len());
948
949 for page in &pages {
950 results.push(extract_page_text(doc, page)?);
951 }
952
953 Ok(results)
954}
955
956pub fn extract_page_text_string(doc: &PdfDocument, page: &PageInfo) -> Result<String> {
958 let page_text = extract_page_text(doc, page)?;
959 Ok(page_text.plain_text())
960}
961
962pub fn extract_all_text_string(doc: &PdfDocument) -> Result<String> {
964 let pages = extract_all_text(doc)?;
965 let texts: Vec<String> = pages.iter().map(|p| p.plain_text()).collect();
966 Ok(texts.join("\n\n"))
967}
968
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `actual` lies within `tolerance` of `expected`.
    #[track_caller]
    fn assert_close(actual: f64, expected: f64, tolerance: f64) {
        assert!((actual - expected).abs() < tolerance);
    }

    /// Builds a content-stream operation.
    fn op(operator: &[u8], operands: Vec<Operand>) -> ContentOp {
        ContentOp {
            operator: operator.to_vec(),
            operands,
        }
    }

    /// `/F1 12 Tf`
    fn select_f1_at_12() -> ContentOp {
        op(
            b"Tf",
            vec![Operand::Name(b"F1".to_vec()), Operand::Integer(12)],
        )
    }

    /// `(text) Tj`
    fn show(text: &[u8]) -> ContentOp {
        op(b"Tj", vec![Operand::String(text.to_vec())])
    }

    /// A font table holding one standard Type1 font named `F1` in which
    /// every glyph has the width `default_width`.
    fn f1_fonts(base_font: &[u8], default_width: f64) -> HashMap<Vec<u8>, ResolvedFont> {
        let info = FontInfo {
            base_font: base_font.to_vec(),
            subtype: b"Type1".to_vec(),
            encoding: Encoding::WinAnsiEncoding,
            widths: crate::font::FontWidths::None { default_width },
            to_unicode: None,
            is_standard14: true,
            descriptor: None,
        };
        HashMap::from([(b"F1".to_vec(), ResolvedFont { info, cmap: None })])
    }

    /// Runs `ops` through a fresh interpreter that uses `fonts`.
    fn interpret(ops: &[ContentOp], fonts: HashMap<Vec<u8>, ResolvedFont>) -> Vec<TextChar> {
        TextInterpreter::new(fonts).run(ops)
    }

    /// A glyph in font `F1` at size 12.
    fn glyph(unicode: &str, x: f64, y: f64, width: f64) -> TextChar {
        TextChar {
            unicode: unicode.into(),
            x,
            y,
            font_size: 12.0,
            font_name: "F1".into(),
            width,
        }
    }

    /// A word at font size 12.
    fn word(text: &str, x: f64, y: f64, width: f64) -> TextWord {
        TextWord {
            text: text.into(),
            x,
            y,
            width,
            font_size: 12.0,
        }
    }

    #[test]
    fn test_matrix_identity() {
        let (x, y) = Matrix::identity().transform_point(10.0, 20.0);
        assert_close(x, 10.0, 1e-10);
        assert_close(y, 20.0, 1e-10);
    }

    #[test]
    fn test_matrix_translate() {
        let (x, y) = Matrix::translate(100.0, 200.0).transform_point(0.0, 0.0);
        assert_close(x, 100.0, 1e-10);
        assert_close(y, 200.0, 1e-10);
    }

    #[test]
    fn test_matrix_concat() {
        let first = Matrix::translate(10.0, 20.0);
        let second = Matrix::translate(30.0, 40.0);
        let (x, y) = first.concat(&second).transform_point(0.0, 0.0);
        assert_close(x, 40.0, 1e-10);
        assert_close(y, 60.0, 1e-10);
    }

    #[test]
    fn test_matrix_scale() {
        let scale = Matrix {
            a: 2.0,
            b: 0.0,
            c: 0.0,
            d: 3.0,
            e: 0.0,
            f: 0.0,
        };
        let (x, y) = scale.transform_point(10.0, 10.0);
        assert_close(x, 20.0, 1e-10);
        assert_close(y, 30.0, 1e-10);
    }

    #[test]
    fn test_interpreter_basic_text() {
        let ops = vec![
            op(b"BT", vec![]),
            select_f1_at_12(),
            op(b"Td", vec![Operand::Integer(72), Operand::Integer(720)]),
            show(b"Hello"),
            op(b"ET", vec![]),
        ];

        let chars = interpret(&ops, f1_fonts(b"Helvetica", 600.0));

        assert_eq!(chars.len(), 5);
        assert_eq!(chars[0].unicode, "H");
        assert_eq!(chars[1].unicode, "e");
        assert_eq!(chars[4].unicode, "o");
        assert_close(chars[0].x, 72.0, 0.01);
        assert_close(chars[0].y, 720.0, 0.01);
    }

    #[test]
    fn test_interpreter_tj_array() {
        let ops = vec![
            op(b"BT", vec![]),
            select_f1_at_12(),
            op(
                b"TJ",
                vec![Operand::Array(vec![
                    Operand::String(b"H".to_vec()),
                    Operand::Integer(-100),
                    Operand::String(b"i".to_vec()),
                ])],
            ),
            op(b"ET", vec![]),
        ];

        let chars = interpret(&ops, f1_fonts(b"Helvetica", 500.0));

        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0].unicode, "H");
        assert_eq!(chars[1].unicode, "i");

        // "i" sits one glyph advance plus the (negated) adjustment after "H".
        let h_advance = 500.0 / 1000.0 * 12.0;
        let kern = 100.0 / 1000.0 * 12.0;
        let expected_i_x = h_advance + kern;
        assert_close(chars[1].x, expected_i_x, 0.01);
    }

    #[test]
    fn test_word_grouping() {
        let chars = vec![
            glyph("H", 72.0, 720.0, 7.0),
            glyph("i", 79.0, 720.0, 3.0),
            glyph(" ", 82.0, 720.0, 3.0),
            glyph("A", 90.0, 720.0, 7.0),
        ];

        let words = group_into_words(&chars);
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hi");
        assert_eq!(words[1].text, "A");
    }

    #[test]
    fn test_line_grouping() {
        let words = vec![
            word("Hello", 72.0, 720.0, 30.0),
            word("World", 110.0, 720.0, 30.0),
            word("Next", 72.0, 700.0, 24.0),
        ];

        let lines = group_into_lines(&words);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].text, "Hello World");
        assert_eq!(lines[1].text, "Next");
    }

    #[test]
    fn test_interpreter_multiline() {
        let ops = vec![
            op(b"BT", vec![]),
            select_f1_at_12(),
            op(b"Td", vec![Operand::Integer(72), Operand::Integer(720)]),
            show(b"Line1"),
            op(b"Td", vec![Operand::Integer(0), Operand::Integer(-14)]),
            show(b"Line2"),
            op(b"ET", vec![]),
        ];

        let chars = interpret(&ops, f1_fonts(b"Courier", 600.0));

        assert_eq!(chars.len(), 10);
        assert_close(chars[0].y, 720.0, 0.01);
        assert_close(chars[5].y, 706.0, 0.01);

        let words = group_into_words(&chars);
        let lines = group_into_lines(&words);
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].text, "Line1");
        assert_eq!(lines[1].text, "Line2");
    }

    #[test]
    fn test_empty_chars() {
        let words = group_into_words(&[]);
        assert!(words.is_empty());
        let lines = group_into_lines(&[]);
        assert!(lines.is_empty());
        let blocks = layout::detect_columns_and_reorder(&[]);
        assert!(blocks.is_empty());
    }

    #[test]
    fn test_graphics_state_save_restore() {
        let ops = vec![
            // "A" is shown under a 2x scale that `Q` then undoes.
            op(b"q", vec![]),
            op(
                b"cm",
                vec![
                    Operand::Integer(2),
                    Operand::Integer(0),
                    Operand::Integer(0),
                    Operand::Integer(2),
                    Operand::Integer(0),
                    Operand::Integer(0),
                ],
            ),
            op(b"BT", vec![]),
            select_f1_at_12(),
            show(b"A"),
            op(b"ET", vec![]),
            op(b"Q", vec![]),
            // "B" is shown with the original CTM.
            op(b"BT", vec![]),
            select_f1_at_12(),
            show(b"B"),
            op(b"ET", vec![]),
        ];

        let chars = interpret(&ops, f1_fonts(b"Helvetica", 600.0));

        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0].unicode, "A");
        assert_eq!(chars[1].unicode, "B");
        assert_close(chars[0].font_size, 24.0, 0.01);
        assert_close(chars[1].font_size, 12.0, 0.01);
    }
}