1use lopdf::{content::Content, Dictionary, Document, Object, ObjectId};
15
16use crate::models::bbox::{BoundingBox, Vertex};
17use crate::models::chunks::{ImageChunk, LineArtChunk, LineChunk, TextChunk};
18use crate::EdgePdfError;
19
20use super::font::{resolve_page_fonts, FontCache, PdfFont};
21use super::graphics_state::{GraphicsStateStack, Matrix};
22
23const MAX_FORM_RECURSION_DEPTH: u32 = 10;
25
26const MIN_LINE_WIDTH: f64 = 0.1;
28
29const LINE_ASPECT_RATIO: f64 = 3.0;
31
32const MAX_LINE_THICKNESS: f64 = 10.0;
34
/// All chunks extracted from a single page's content stream(s).
#[derive(Debug, Default)]
pub struct PageChunks {
    /// Positioned text runs (one per shown string / TJ array element).
    pub text_chunks: Vec<TextChunk>,
    /// Bounding boxes of placed images (Image XObjects and inline images).
    pub image_chunks: Vec<ImageChunk>,
    /// Straight ruling lines and rectangles classified from painted paths.
    pub line_chunks: Vec<LineChunk>,
    /// Grouped vector-graphics paths that are not simple ruling lines.
    pub line_art_chunks: Vec<LineArtChunk>,
}
47
48pub fn extract_page_chunks(
50 doc: &Document,
51 page_number: u32,
52 page_id: ObjectId,
53) -> Result<PageChunks, EdgePdfError> {
54 let font_cache = resolve_page_fonts(doc, page_id);
55
56 let page_dict = doc
57 .get_object(page_id)
58 .map_err(|e| EdgePdfError::PipelineError {
59 stage: 1,
60 message: format!("Failed to get page {}: {}", page_number, e),
61 })?
62 .as_dict()
63 .map_err(|e| EdgePdfError::PipelineError {
64 stage: 1,
65 message: format!("Page {} is not a dictionary: {}", page_number, e),
66 })?
67 .clone();
68
69 let content_data = super::text_extractor::get_page_content(doc, &page_dict)?;
71 if content_data.is_empty() {
72 return Ok(PageChunks::default());
73 }
74
75 let content = Content::decode(&content_data).map_err(|e| EdgePdfError::PipelineError {
77 stage: 1,
78 message: format!(
79 "Failed to decode content stream for page {}: {}",
80 page_number, e
81 ),
82 })?;
83
84 let resources = resolve_page_resources(doc, &page_dict);
86
87 let mut parser = ChunkParserState::new(page_number, font_cache);
88 parser.process_operations(doc, &content.operations, &resources, 0);
89
90 Ok(parser.into_page_chunks())
91}
92
93fn resolve_page_resources(doc: &Document, page_dict: &Dictionary) -> Dictionary {
95 match page_dict.get(b"Resources") {
96 Ok(obj) => {
97 let resolved = resolve_obj(doc, obj);
98 resolved.as_dict().cloned().unwrap_or_default()
99 }
100 Err(_) => Dictionary::new(),
101 }
102}
103
/// Streaming interpreter state for one page's content operations.
struct ChunkParserState {
    // Page number stamped onto every emitted chunk's bounding box.
    page_number: u32,
    // Fonts from the page /Resources; temporarily replaced by a merged cache
    // while a Form XObject is being processed.
    font_cache: FontCache,
    // q/Q-managed stack of CTM, text, and color state.
    gs_stack: GraphicsStateStack,

    // Accumulated output, moved out by `into_page_chunks`.
    text_chunks: Vec<TextChunk>,
    image_chunks: Vec<ImageChunk>,
    line_chunks: Vec<LineChunk>,
    line_art_chunks: Vec<LineArtChunk>,

    // Running counters used to number emitted chunks.
    text_index: usize,
    image_index: u32,
    line_index: u32,

    // Nested BMC/BDC marked-content frames; Some(mcid) when the BDC operand
    // dictionary carried an /MCID entry.
    mcid_stack: Vec<Option<i64>>,

    // In-progress path between path-construction and path-painting operators.
    current_path: Vec<PathSegment>,
    // Start of the current subpath (set by `m` and `re`), in device space.
    subpath_start: Option<(f64, f64)>,
    // Current pen position, in device space.
    current_point: Option<(f64, f64)>,
    // Stroke width set by `w` / ExtGState /LW. NOTE(review): kept here rather
    // than in the graphics state, so q/Q do not save/restore it — confirm
    // whether that divergence from the PDF spec is intentional.
    line_width: f64,
}
130
131impl ChunkParserState {
132 fn new(page_number: u32, font_cache: FontCache) -> Self {
133 Self {
134 page_number,
135 font_cache,
136 gs_stack: GraphicsStateStack::default(),
137
138 text_chunks: Vec::new(),
139 image_chunks: Vec::new(),
140 line_chunks: Vec::new(),
141 line_art_chunks: Vec::new(),
142
143 text_index: 0,
144 image_index: 0,
145 line_index: 0,
146
147 mcid_stack: Vec::new(),
148
149 current_path: Vec::new(),
150 subpath_start: None,
151 current_point: None,
152 line_width: 1.0,
153 }
154 }
155
156 fn into_page_chunks(self) -> PageChunks {
157 PageChunks {
158 text_chunks: self.text_chunks,
159 image_chunks: self.image_chunks,
160 line_chunks: self.line_chunks,
161 line_art_chunks: self.line_art_chunks,
162 }
163 }
164
165 fn process_operations(
167 &mut self,
168 doc: &Document,
169 operations: &[lopdf::content::Operation],
170 resources: &Dictionary,
171 recursion_depth: u32,
172 ) {
173 for op in operations {
174 match op.operator.as_str() {
175 "BMC" => {
177 self.mcid_stack.push(None);
178 }
179 "BDC" => {
180 let mcid = extract_mcid_from_bdc(&op.operands);
181 self.mcid_stack.push(mcid);
182 }
183 "EMC" => {
184 self.mcid_stack.pop();
185 }
186
187 "q" => self.gs_stack.save(),
189 "Q" => self.gs_stack.restore(),
190 "cm" => {
191 if op.operands.len() == 6 {
192 let vals: Vec<f64> = op
193 .operands
194 .iter()
195 .filter_map(|o| obj_to_f64(o.clone()))
196 .collect();
197 if vals.len() == 6 {
198 self.gs_stack
199 .concat_ctm(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5]);
200 }
201 }
202 }
203 "gs" => {
204 if let Some(name) = op.operands.first().and_then(obj_name_bytes) {
206 self.apply_ext_gstate(doc, resources, &name);
207 }
208 }
209
210 "BT" => self.gs_stack.current.begin_text(),
212 "ET" => {}
213
214 "Tf" => {
215 if op.operands.len() == 2 {
216 if let Object::Name(ref name) = op.operands[0] {
217 self.gs_stack.current.text_state.font_name =
218 String::from_utf8_lossy(name).to_string();
219 }
220 if let Some(size) = obj_to_f64(op.operands[1].clone()) {
221 self.gs_stack.current.text_state.font_size = size;
222 }
223 }
224 }
225 "Tc" => {
226 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
227 self.gs_stack.current.text_state.char_spacing = v;
228 }
229 }
230 "Tw" => {
231 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
232 self.gs_stack.current.text_state.word_spacing = v;
233 }
234 }
235 "Tz" => {
236 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
237 self.gs_stack.current.text_state.horizontal_scaling = v;
238 }
239 }
240 "TL" => {
241 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
242 self.gs_stack.current.text_state.leading = v;
243 }
244 }
245 "Ts" => {
246 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
247 self.gs_stack.current.text_state.rise = v;
248 }
249 }
250 "Tr" => {
251 if let Some(v) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
252 self.gs_stack.current.text_state.render_mode = v as i32;
253 }
254 }
255
256 "Td" => {
258 if op.operands.len() == 2 {
259 let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
260 let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
261 self.gs_stack.current.translate_text(tx, ty);
262 }
263 }
264 "TD" => {
265 if op.operands.len() == 2 {
266 let tx = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
267 let ty = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
268 self.gs_stack.current.text_state.leading = -ty;
269 self.gs_stack.current.translate_text(tx, ty);
270 }
271 }
272 "Tm" => {
273 if op.operands.len() == 6 {
274 let vals: Vec<f64> = op
275 .operands
276 .iter()
277 .filter_map(|o| obj_to_f64(o.clone()))
278 .collect();
279 if vals.len() == 6 {
280 self.gs_stack.current.set_text_matrix(
281 vals[0], vals[1], vals[2], vals[3], vals[4], vals[5],
282 );
283 }
284 }
285 }
286 "T*" => {
287 self.gs_stack.current.next_line();
288 }
289
290 "Tj" => {
292 if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
293 self.emit_text_chunk(&text_bytes);
294 }
295 }
296 "TJ" => {
297 if let Some(Object::Array(ref arr)) = op.operands.first() {
298 self.process_tj_array(arr);
299 }
300 }
301 "'" => {
302 self.gs_stack.current.next_line();
303 if let Some(text_bytes) = op.operands.first().and_then(extract_string_bytes) {
304 self.emit_text_chunk(&text_bytes);
305 }
306 }
307 "\"" => {
308 if op.operands.len() == 3 {
309 if let Some(aw) = obj_to_f64(op.operands[0].clone()) {
310 self.gs_stack.current.text_state.word_spacing = aw;
311 }
312 if let Some(ac) = obj_to_f64(op.operands[1].clone()) {
313 self.gs_stack.current.text_state.char_spacing = ac;
314 }
315 self.gs_stack.current.next_line();
316 if let Some(text_bytes) = extract_string_bytes(&op.operands[2]) {
317 self.emit_text_chunk(&text_bytes);
318 }
319 }
320 }
321
322 "g" => {
324 if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
325 self.gs_stack.current.fill_color = vec![gray];
326 self.gs_stack.current.fill_color_space_components = 1;
327 }
328 }
329 "G" => {
330 if let Some(gray) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
331 self.gs_stack.current.stroke_color = vec![gray];
332 self.gs_stack.current.stroke_color_space_components = 1;
333 }
334 }
335 "rg" => {
336 if op.operands.len() == 3 {
337 let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
338 let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
339 let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
340 self.gs_stack.current.fill_color = vec![r, g, b];
341 self.gs_stack.current.fill_color_space_components = 3;
342 }
343 }
344 "RG" => {
345 if op.operands.len() == 3 {
346 let r = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
347 let g = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
348 let b = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
349 self.gs_stack.current.stroke_color = vec![r, g, b];
350 self.gs_stack.current.stroke_color_space_components = 3;
351 }
352 }
353 "k" => {
354 if op.operands.len() == 4 {
355 let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
356 let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
357 let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
358 let kk = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
359 self.gs_stack.current.fill_color = vec![c, m, y, kk];
360 self.gs_stack.current.fill_color_space_components = 4;
361 }
362 }
363 "K" => {
364 if op.operands.len() == 4 {
365 let c = obj_to_f64(op.operands[0].clone()).unwrap_or(0.0);
366 let m = obj_to_f64(op.operands[1].clone()).unwrap_or(0.0);
367 let y = obj_to_f64(op.operands[2].clone()).unwrap_or(0.0);
368 let kk = obj_to_f64(op.operands[3].clone()).unwrap_or(0.0);
369 self.gs_stack.current.stroke_color = vec![c, m, y, kk];
370 self.gs_stack.current.stroke_color_space_components = 4;
371 }
372 }
373 "cs" => {
374 if let Some(name) = op.operands.first() {
375 let cs_name = obj_to_name(name);
376 let comps = color_space_components(&cs_name);
377 self.gs_stack.current.fill_color_space_components = comps;
378 self.gs_stack.current.fill_color = default_color_for_space(comps);
380 }
381 }
382 "CS" => {
383 if let Some(name) = op.operands.first() {
384 let cs_name = obj_to_name(name);
385 let comps = color_space_components(&cs_name);
386 self.gs_stack.current.stroke_color_space_components = comps;
387 self.gs_stack.current.stroke_color = default_color_for_space(comps);
389 }
390 }
391 "sc" | "scn" => {
392 let components: Vec<f64> = op
393 .operands
394 .iter()
395 .filter_map(|o| obj_to_f64(o.clone()))
396 .collect();
397 if !components.is_empty() {
398 self.gs_stack.current.fill_color = components;
399 }
400 }
401 "SC" | "SCN" => {
402 let components: Vec<f64> = op
403 .operands
404 .iter()
405 .filter_map(|o| obj_to_f64(o.clone()))
406 .collect();
407 if !components.is_empty() {
408 self.gs_stack.current.stroke_color = components;
409 }
410 }
411
412 "w" => {
414 if let Some(w) = op.operands.first().and_then(|o| obj_to_f64(o.clone())) {
415 self.line_width = w;
416 }
417 }
418
419 "m" => {
421 if op.operands.len() >= 2 {
422 if let (Some(x), Some(y)) = (
423 op.operands.first().and_then(|o| obj_to_f64(o.clone())),
424 op.operands.get(1).and_then(|o| obj_to_f64(o.clone())),
425 ) {
426 let (tx, ty) = self.transform_point(x, y);
427 self.subpath_start = Some((tx, ty));
428 self.current_point = Some((tx, ty));
429 }
430 }
431 }
432 "l" => {
433 if op.operands.len() >= 2 {
434 if let (Some(x), Some(y)) = (
435 op.operands.first().and_then(|o| obj_to_f64(o.clone())),
436 op.operands.get(1).and_then(|o| obj_to_f64(o.clone())),
437 ) {
438 let (tx, ty) = self.transform_point(x, y);
439 if let Some((cx, cy)) = self.current_point {
440 self.current_path.push(PathSegment::Line {
441 x1: cx,
442 y1: cy,
443 x2: tx,
444 y2: ty,
445 });
446 }
447 self.current_point = Some((tx, ty));
448 }
449 }
450 }
451 "c" => {
452 if op.operands.len() >= 6 {
453 let vals: Vec<f64> = op
454 .operands
455 .iter()
456 .filter_map(|o| obj_to_f64(o.clone()))
457 .collect();
458 if vals.len() >= 6 {
459 let (tx, ty) = self.transform_point(vals[4], vals[5]);
460 if let Some((cx, cy)) = self.current_point {
461 let (cp1x, cp1y) = self.transform_point(vals[0], vals[1]);
462 let (cp2x, cp2y) = self.transform_point(vals[2], vals[3]);
463 self.current_path.push(PathSegment::Curve {
464 x1: cx,
465 y1: cy,
466 cp1x,
467 cp1y,
468 cp2x,
469 cp2y,
470 x2: tx,
471 y2: ty,
472 });
473 }
474 self.current_point = Some((tx, ty));
475 }
476 }
477 }
478 "v" => {
479 if op.operands.len() >= 4 {
480 let vals: Vec<f64> = op
481 .operands
482 .iter()
483 .filter_map(|o| obj_to_f64(o.clone()))
484 .collect();
485 if vals.len() >= 4 {
486 let (tx, ty) = self.transform_point(vals[2], vals[3]);
487 if let Some((cx, cy)) = self.current_point {
488 let (cp2x, cp2y) = self.transform_point(vals[0], vals[1]);
489 self.current_path.push(PathSegment::Curve {
490 x1: cx,
491 y1: cy,
492 cp1x: cx,
493 cp1y: cy,
494 cp2x,
495 cp2y,
496 x2: tx,
497 y2: ty,
498 });
499 }
500 self.current_point = Some((tx, ty));
501 }
502 }
503 }
504 "y" => {
505 if op.operands.len() >= 4 {
506 let vals: Vec<f64> = op
507 .operands
508 .iter()
509 .filter_map(|o| obj_to_f64(o.clone()))
510 .collect();
511 if vals.len() >= 4 {
512 let (tx, ty) = self.transform_point(vals[2], vals[3]);
513 if let Some((cx, cy)) = self.current_point {
514 let (cp1x, cp1y) = self.transform_point(vals[0], vals[1]);
515 self.current_path.push(PathSegment::Curve {
516 x1: cx,
517 y1: cy,
518 cp1x,
519 cp1y,
520 cp2x: tx,
521 cp2y: ty,
522 x2: tx,
523 y2: ty,
524 });
525 }
526 self.current_point = Some((tx, ty));
527 }
528 }
529 }
530 "h" => {
531 if let (Some((sx, sy)), Some((cx, cy))) =
532 (self.subpath_start, self.current_point)
533 {
534 if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
535 self.current_path.push(PathSegment::Line {
536 x1: cx,
537 y1: cy,
538 x2: sx,
539 y2: sy,
540 });
541 }
542 self.current_point = self.subpath_start;
543 }
544 }
545 "re" => {
546 if op.operands.len() >= 4 {
547 let vals: Vec<f64> = op
548 .operands
549 .iter()
550 .filter_map(|o| obj_to_f64(o.clone()))
551 .collect();
552 if vals.len() >= 4 {
553 let (x, y, w, h) = (vals[0], vals[1], vals[2], vals[3]);
554 let (x1, y1) = self.transform_point(x, y);
555 let (x2, y2) = self.transform_point(x + w, y);
556 let (x3, y3) = self.transform_point(x + w, y + h);
557 let (x4, y4) = self.transform_point(x, y + h);
558 self.current_path.push(PathSegment::Line { x1, y1, x2, y2 });
559 self.current_path.push(PathSegment::Line {
560 x1: x2,
561 y1: y2,
562 x2: x3,
563 y2: y3,
564 });
565 self.current_path.push(PathSegment::Line {
566 x1: x3,
567 y1: y3,
568 x2: x4,
569 y2: y4,
570 });
571 self.current_path.push(PathSegment::Line {
572 x1: x4,
573 y1: y4,
574 x2: x1,
575 y2: y1,
576 });
577 self.subpath_start = Some((x1, y1));
578 self.current_point = Some((x1, y1));
579 }
580 }
581 }
582
583 "S" => {
585 self.classify_and_emit_path();
586 }
587 "s" => {
588 self.close_subpath();
590 self.classify_and_emit_path();
591 }
592 "f" | "F" | "f*" => {
593 self.classify_and_emit_path();
594 }
595 "B" | "B*" | "b" | "b*" => {
596 if op.operator.starts_with('b') {
597 self.close_subpath();
598 }
599 self.classify_and_emit_path();
600 }
601 "n" => {
602 self.current_path.clear();
604 self.subpath_start = None;
605 self.current_point = None;
606 }
607
608 "Do" => {
610 if let Some(name_bytes) = op.operands.first().and_then(obj_name_bytes) {
611 self.handle_do_operator(doc, resources, &name_bytes, recursion_depth);
612 }
613 }
614
615 "BI" => {
620 self.emit_inline_image();
621 }
622
623 _ => {
624 }
626 }
627 }
628 }
629
630 fn emit_text_chunk(&mut self, text_bytes: &[u8]) {
633 if text_bytes.is_empty() {
634 return;
635 }
636
637 let font = self
638 .font_cache
639 .get(&self.gs_stack.current.text_state.font_name)
640 .cloned()
641 .unwrap_or_else(|| PdfFont::default_font(&self.gs_stack.current.text_state.font_name));
642 let active_mcid = self.active_mcid();
643
644 if let Some(chunk) = create_text_chunk(
645 text_bytes,
646 &font,
647 &mut self.gs_stack,
648 self.page_number,
649 &mut self.text_index,
650 active_mcid,
651 ) {
652 self.text_chunks.push(chunk);
653 }
654 }
655
656 fn process_tj_array(&mut self, arr: &[Object]) {
657 let font = self
658 .font_cache
659 .get(&self.gs_stack.current.text_state.font_name)
660 .cloned()
661 .unwrap_or_else(|| PdfFont::default_font(&self.gs_stack.current.text_state.font_name));
662 let active_mcid = self.active_mcid();
663
664 for item in arr {
665 match item {
666 Object::String(bytes, _) => {
667 if let Some(chunk) = create_text_chunk(
668 bytes,
669 &font,
670 &mut self.gs_stack,
671 self.page_number,
672 &mut self.text_index,
673 active_mcid,
674 ) {
675 self.text_chunks.push(chunk);
676 }
677 }
678 _ => {
679 if let Some(adj) = obj_to_f64(item.clone()) {
680 let displacement =
681 -adj / 1000.0 * self.gs_stack.current.text_state.font_size;
682 self.gs_stack.current.advance_text(displacement);
683 }
684 }
685 }
686 }
687 }
688
689 fn handle_do_operator(
693 &mut self,
694 doc: &Document,
695 resources: &Dictionary,
696 name_bytes: &[u8],
697 recursion_depth: u32,
698 ) {
699 let xobject_dict = match resources.get(b"XObject") {
701 Ok(obj) => {
702 let resolved = resolve_obj(doc, obj);
703 match resolved.as_dict() {
704 Ok(d) => d.clone(),
705 Err(_) => return,
706 }
707 }
708 Err(_) => return,
709 };
710
711 let xobj_ref = match xobject_dict.get(name_bytes) {
712 Ok(obj) => resolve_obj(doc, obj),
713 Err(_) => return,
714 };
715
716 let stream = match xobj_ref.as_stream() {
717 Ok(s) => s.clone(),
718 Err(_) => return,
719 };
720
721 let subtype = stream
722 .dict
723 .get(b"Subtype")
724 .ok()
725 .and_then(|o| match resolve_obj(doc, o) {
726 Object::Name(n) => Some(String::from_utf8_lossy(&n).to_string()),
727 _ => None,
728 });
729
730 match subtype.as_deref() {
731 Some("Image") => {
732 self.emit_image_from_ctm();
734 }
735 Some("Form") => {
736 if recursion_depth < MAX_FORM_RECURSION_DEPTH {
738 self.process_form_xobject(doc, &stream, resources, recursion_depth);
739 }
740 }
741 _ => {}
742 }
743 }
744
745 fn emit_image_from_ctm(&mut self) {
748 let ctm = &self.gs_stack.current.ctm;
749
750 let (x0, y0) = ctm.transform_point(0.0, 0.0);
752 let (x1, y1) = ctm.transform_point(1.0, 0.0);
753 let (x2, y2) = ctm.transform_point(1.0, 1.0);
754 let (x3, y3) = ctm.transform_point(0.0, 1.0);
755
756 let min_x = x0.min(x1).min(x2).min(x3);
757 let max_x = x0.max(x1).max(x2).max(x3);
758 let min_y = y0.min(y1).min(y2).min(y3);
759 let max_y = y0.max(y1).max(y2).max(y3);
760
761 if (max_x - min_x).abs() < 0.1 || (max_y - min_y).abs() < 0.1 {
763 return;
764 }
765
766 self.image_index += 1;
767 self.image_chunks.push(ImageChunk {
768 bbox: BoundingBox::new(Some(self.page_number), min_x, min_y, max_x, max_y),
769 index: Some(self.image_index),
770 level: None,
771 });
772 }
773
    /// Handles `BI` (begin inline image). The inline image data itself is not
    /// parsed; the placement bbox is derived from the current CTM exactly as
    /// for an Image XObject.
    fn emit_inline_image(&mut self) {
        self.emit_image_from_ctm();
    }
779
    /// Replays a Form XObject's content stream inside the current state.
    ///
    /// Saves the graphics state, applies the form's /Matrix, resolves the
    /// form's own /Resources (falling back to the parent's), temporarily
    /// overlays the form's fonts on top of the page fonts, recurses into
    /// `process_operations` with an incremented depth, then restores both the
    /// font cache and the graphics state.
    fn process_form_xobject(
        &mut self,
        doc: &Document,
        stream: &lopdf::Stream,
        parent_resources: &Dictionary,
        recursion_depth: u32,
    ) {
        // /Matrix maps form space into current user space (identity if absent).
        let form_matrix = get_form_matrix(doc, &stream.dict);

        self.gs_stack.save();
        let m = form_matrix;
        self.gs_stack.concat_ctm(m.a, m.b, m.c, m.d, m.e, m.f);

        // Forms may carry their own /Resources; otherwise inherit the parent's.
        let form_resources = match stream.dict.get(b"Resources") {
            Ok(obj) => {
                let resolved = resolve_obj(doc, obj);
                resolved
                    .as_dict()
                    .cloned()
                    .unwrap_or_else(|_| parent_resources.clone())
            }
            Err(_) => parent_resources.clone(),
        };

        // Decompress only when a /Filter is present; an undecodable stream is
        // skipped (after unwinding the saved graphics state).
        let form_content = if stream.dict.get(b"Filter").is_ok() {
            match stream.decompressed_content() {
                Ok(data) => data,
                Err(_) => {
                    self.gs_stack.restore();
                    return;
                }
            }
        } else {
            stream.content.clone()
        };

        if form_content.is_empty() {
            self.gs_stack.restore();
            return;
        }

        if let Ok(content) = Content::decode(&form_content) {
            // Merge fonts: form-local names shadow page fonts with the same name.
            let form_font_cache = resolve_form_fonts(doc, &form_resources);
            let mut merged_cache = FontCache::default();
            for (name, font) in self.font_cache.iter() {
                merged_cache.insert(name.clone(), font.clone());
            }
            for (name, font) in form_font_cache.iter() {
                merged_cache.insert(name.clone(), font.clone());
            }

            // Swap the merged cache in for the duration of the form only.
            let saved_fc = std::mem::replace(&mut self.font_cache, merged_cache);
            self.process_operations(
                doc,
                &content.operations,
                &form_resources,
                recursion_depth + 1,
            );
            self.font_cache = saved_fc;
        }

        self.gs_stack.restore();
    }
852
853 fn apply_ext_gstate(&mut self, doc: &Document, resources: &Dictionary, name: &[u8]) {
856 let ext_gstate_dict = match resources.get(b"ExtGState") {
857 Ok(obj) => {
858 let resolved = resolve_obj(doc, obj);
859 match resolved.as_dict() {
860 Ok(d) => d.clone(),
861 Err(_) => return,
862 }
863 }
864 Err(_) => return,
865 };
866
867 let gs_obj = match ext_gstate_dict.get(name) {
868 Ok(obj) => resolve_obj(doc, obj),
869 Err(_) => return,
870 };
871
872 let gs_dict = match gs_obj.as_dict() {
873 Ok(d) => d,
874 Err(_) => return,
875 };
876
877 if let Ok(font_arr) = gs_dict.get(b"Font") {
880 if let Ok(arr) = resolve_obj(doc, font_arr).as_array() {
881 if arr.len() >= 2 {
882 if let Object::Name(ref name) = arr[0] {
883 self.gs_stack.current.text_state.font_name =
884 String::from_utf8_lossy(name).to_string();
885 }
886 if let Some(size) = obj_to_f64(arr[1].clone()) {
887 self.gs_stack.current.text_state.font_size = size;
888 }
889 }
890 }
891 }
892
893 if let Ok(lw) = gs_dict.get(b"LW") {
895 if let Some(w) = obj_to_f64(resolve_obj(doc, lw)) {
896 self.line_width = w;
897 }
898 }
899 }
900
901 fn close_subpath(&mut self) {
904 if let (Some((sx, sy)), Some((cx, cy))) = (self.subpath_start, self.current_point) {
905 if (sx - cx).abs() > 0.01 || (sy - cy).abs() > 0.01 {
906 self.current_path.push(PathSegment::Line {
907 x1: cx,
908 y1: cy,
909 x2: sx,
910 y2: sy,
911 });
912 }
913 self.current_point = self.subpath_start;
914 }
915 }
916
    /// Consumes the accumulated path and classifies it into chunks.
    ///
    /// Order of attempts:
    /// 1. Curve-free paths of <= 4 segments: emit individual horizontal /
    ///    vertical ruling lines.
    /// 2. Exactly 4 straight segments with no ruling lines found: try a
    ///    rectangle classification.
    /// 3. Any remaining path with >= 2 segments: emit as generic line art.
    fn classify_and_emit_path(&mut self) {
        // A painting operator always ends the path, even if nothing is emitted.
        let path = std::mem::take(&mut self.current_path);
        self.subpath_start = None;
        self.current_point = None;

        // Hairline strokes below the minimum width are discarded entirely.
        if path.is_empty() || self.line_width < MIN_LINE_WIDTH {
            return;
        }

        let has_curves = path.iter().any(|s| matches!(s, PathSegment::Curve { .. }));

        if !has_curves && path.len() <= 4 {
            let mut classified_lines = Vec::new();
            for seg in &path {
                if let PathSegment::Line { x1, y1, x2, y2 } = seg {
                    let dx = (x2 - x1).abs();
                    let dy = (y2 - y1).abs();
                    let length = (dx * dx + dy * dy).sqrt();

                    if length < MIN_LINE_WIDTH {
                        continue;
                    }

                    // A ruling line is thin on one axis and at least
                    // LINE_ASPECT_RATIO times longer on the other.
                    let is_horizontal = dy < MAX_LINE_THICKNESS && dx > dy * LINE_ASPECT_RATIO;
                    let is_vertical = dx < MAX_LINE_THICKNESS && dy > dx * LINE_ASPECT_RATIO;

                    if is_horizontal || is_vertical {
                        self.line_index += 1;
                        let min_x = x1.min(*x2);
                        let max_x = x1.max(*x2);
                        let min_y = y1.min(*y2);
                        let max_y = y1.max(*y2);
                        // Expand the bbox by half the stroke width along the
                        // line's thin axis.
                        let half_w = self.line_width / 2.0;

                        classified_lines.push(LineChunk {
                            bbox: BoundingBox::new(
                                Some(self.page_number),
                                min_x - if is_vertical { half_w } else { 0.0 },
                                min_y - if is_horizontal { half_w } else { 0.0 },
                                max_x + if is_vertical { half_w } else { 0.0 },
                                max_y + if is_horizontal { half_w } else { 0.0 },
                            ),
                            index: Some(self.line_index),
                            level: None,
                            start: Vertex {
                                x: *x1,
                                y: *y1,
                                radius: 0.0,
                            },
                            end: Vertex {
                                x: *x2,
                                y: *y2,
                                radius: 0.0,
                            },
                            width: self.line_width,
                            is_horizontal_line: is_horizontal,
                            is_vertical_line: is_vertical,
                            is_square: false,
                        });
                    }
                }
            }
            if !classified_lines.is_empty() {
                self.line_chunks.extend(classified_lines);
                return;
            }
        }

        // Four straight segments that produced no ruling lines may form a box
        // (e.g. from the `re` operator).
        if !has_curves && path.len() == 4 {
            if let Some(rect) = try_classify_rectangle(&path, self.line_width, self.page_number) {
                self.line_index += 1;
                let mut rect = rect;
                rect.index = Some(self.line_index);
                self.line_chunks.push(rect);
                return;
            }
        }

        // Fallback: generic line art, one LineChunk per segment; curves are
        // represented by their endpoint chord.
        // NOTE(review): a single-segment path containing one curve falls
        // through all branches and is dropped — confirm that is intended.
        if path.len() >= 2 {
            let mut art_lines = Vec::new();
            let mut min_x = f64::MAX;
            let mut min_y = f64::MAX;
            let mut max_x = f64::MIN;
            let mut max_y = f64::MIN;

            for seg in &path {
                let (sx, sy, ex, ey) = match seg {
                    PathSegment::Line { x1, y1, x2, y2 } => (*x1, *y1, *x2, *y2),
                    PathSegment::Curve { x1, y1, x2, y2, .. } => (*x1, *y1, *x2, *y2),
                };
                min_x = min_x.min(sx).min(ex);
                min_y = min_y.min(sy).min(ey);
                max_x = max_x.max(sx).max(ex);
                max_y = max_y.max(sy).max(ey);

                self.line_index += 1;
                art_lines.push(LineChunk {
                    bbox: BoundingBox::new(
                        Some(self.page_number),
                        sx.min(ex),
                        sy.min(ey),
                        sx.max(ex),
                        sy.max(ey),
                    ),
                    index: Some(self.line_index),
                    level: None,
                    start: Vertex {
                        x: sx,
                        y: sy,
                        radius: 0.0,
                    },
                    end: Vertex {
                        x: ex,
                        y: ey,
                        radius: 0.0,
                    },
                    width: self.line_width,
                    is_horizontal_line: false,
                    is_vertical_line: false,
                    is_square: false,
                });
            }

            // Wrapper chunk covering the whole drawing's bounds.
            self.line_index += 1;
            self.line_art_chunks.push(LineArtChunk {
                bbox: BoundingBox::new(Some(self.page_number), min_x, min_y, max_x, max_y),
                index: Some(self.line_index),
                level: None,
                line_chunks: art_lines,
            });
        }
    }
1052
    /// Maps a user-space point through the current transformation matrix into
    /// device space.
    fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        self.gs_stack.current.ctm.transform_point(x, y)
    }
1058
1059 fn active_mcid(&self) -> Option<i64> {
1060 self.mcid_stack.iter().rev().find_map(|&mcid| mcid)
1061 }
1062}
1063
/// Decodes one shown string into a positioned `TextChunk`, advancing the
/// text matrix past the string as a side effect.
///
/// Returns `None` for empty input, for an effective font size below 0.1, or
/// when decoding yields no text (the text-matrix advance still happens in
/// that last case).
fn create_text_chunk(
    text_bytes: &[u8],
    font: &PdfFont,
    state: &mut GraphicsStateStack,
    page_number: u32,
    chunk_index: &mut usize,
    mcid: Option<i64>,
) -> Option<TextChunk> {
    if text_bytes.is_empty() {
        return None;
    }

    // Text rendering matrix snapshot before this string is shown.
    let trm = state.current.text_rendering_matrix();
    let start_x = trm.e;
    let font_size = trm.font_size_factor();

    if font_size < 0.1 {
        return None;
    }

    let mut text = String::new();
    // Accumulated advance in text-space units (glyph widths are 1/1000 em).
    let mut total_width = 0.0;
    // Device-space x coordinate where each decoded symbol ends.
    let mut symbol_ends = Vec::new();

    // Fixed code width per font: presumably 1 byte for simple fonts and 2 for
    // CID fonts — confirmed by `bytes_per_code` usage below.
    let bpc = font.bytes_per_code as usize;
    let mut pos = 0;
    while pos + bpc <= text_bytes.len() {
        let char_code = if bpc == 2 {
            ((text_bytes[pos] as u32) << 8) | (text_bytes[pos + 1] as u32)
        } else {
            text_bytes[pos] as u32
        };
        pos += bpc;

        let decoded = font.decode_char(char_code);
        text.push_str(&decoded);

        let glyph_w = font.glyph_width(char_code) / 1000.0;
        total_width += glyph_w;
        symbol_ends.push(start_x + total_width * font_size);

        // Tc spacing, converted back to text space by dividing by Tf size.
        // NOTE(review): text_state.font_size could be 0 here, producing
        // NaN/inf — confirm upstream guarantees a nonzero size.
        total_width += state.current.text_state.char_spacing / state.current.text_state.font_size;

        // Tw word spacing applies after space characters.
        if decoded == " " {
            total_width +=
                state.current.text_state.word_spacing / state.current.text_state.font_size;
        }
    }

    // Advance the text matrix past this string (Tj semantics).
    let displacement = total_width * state.current.text_state.font_size;
    state.current.advance_text(displacement);

    if text.is_empty() {
        return None;
    }

    let trm_after = state.current.text_rendering_matrix();

    let ascent = font.ascent;
    let descent = font.descent;

    let trm_before = &trm;

    // Derive the bbox corners from the before/after matrices. The four sign
    // combinations of the matrix coefficients decide which matrix and which
    // of ascent/descent yields the min vs. max on each axis (this handles
    // rotated and mirrored text).
    let (x1, x2) = if trm_before.a >= 0.0 && trm_before.c >= 0.0 {
        (
            trm_before.e + descent * trm_before.c / 1000.0,
            trm_after.e + ascent * trm_after.c / 1000.0,
        )
    } else if trm_before.a < 0.0 && trm_before.c < 0.0 {
        (
            trm_after.e + ascent * trm_after.c / 1000.0,
            trm_before.e + descent * trm_before.c / 1000.0,
        )
    } else if trm_before.a >= 0.0 {
        (
            trm_before.e + ascent * trm_before.c / 1000.0,
            trm_after.e + descent * trm_after.c / 1000.0,
        )
    } else {
        (
            trm_after.e + descent * trm_after.c / 1000.0,
            trm_before.e + ascent * trm_before.c / 1000.0,
        )
    };

    let (y1, y2) = if trm_before.d >= 0.0 && trm_before.b >= 0.0 {
        (
            trm_before.f + descent * trm_before.d / 1000.0,
            trm_after.f + ascent * trm_after.d / 1000.0,
        )
    } else if trm_before.d < 0.0 && trm_before.b < 0.0 {
        (
            trm_after.f + ascent * trm_after.d / 1000.0,
            trm_before.f + descent * trm_before.d / 1000.0,
        )
    } else if trm_before.d >= 0.0 {
        (
            trm_after.f + descent * trm_after.d / 1000.0,
            trm_before.f + ascent * trm_before.d / 1000.0,
        )
    } else {
        (
            trm_before.f + ascent * trm_before.d / 1000.0,
            trm_after.f + descent * trm_after.d / 1000.0,
        )
    };

    let bbox = BoundingBox::new(Some(page_number), x1, y1, x2, y2);

    // Ts rise beyond 10% of the font size marks super-/subscript.
    let text_format = if state.current.text_state.rise > font_size * 0.1 {
        crate::models::enums::TextFormat::Superscript
    } else if state.current.text_state.rise < -font_size * 0.1 {
        crate::models::enums::TextFormat::Subscript
    } else {
        crate::models::enums::TextFormat::Normal
    };

    *chunk_index += 1;

    // Render the fill color as e.g. "[0.0, 0.5, 1.0]". Components are
    // round-tripped through f32 — presumably to normalize the printed
    // representation; confirm before changing.
    let fc = &state.current.fill_color;
    let font_color = format!(
        "[{}]",
        fc.iter()
            .map(|v| {
                let f32_val = *v as f32;
                let f64_repr = f32_val as f64;
                if f32_val.fract() == 0.0 {
                    format!("{:.1}", f64_repr)
                } else {
                    format!("{}", f64_repr)
                }
            })
            .collect::<Vec<_>>()
            .join(", ")
    );

    Some(TextChunk {
        value: text,
        bbox,
        font_name: font.base_font.clone(),
        font_size,
        font_weight: font.weight,
        italic_angle: font.italic_angle,
        font_color,
        // Fixed maximum contrast; actual background is not analyzed here.
        contrast_ratio: 21.0,
        symbol_ends,
        text_format,
        text_type: crate::models::enums::TextType::Regular,
        pdf_layer: crate::models::enums::PdfLayer::Main,
        ocg_visible: true,
        index: Some(*chunk_index),
        page_number: Some(page_number),
        level: None,
        mcid,
    })
}
1232
/// A device-space path segment accumulated between path construction and
/// path painting operators.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from (x1, y1) to (x2, y2).
    Line {
        x1: f64,
        y1: f64,
        x2: f64,
        y2: f64,
    },
    /// Cubic Bézier from (x1, y1) to (x2, y2). Control points are stored but
    /// currently unused — classification only looks at endpoints.
    #[allow(dead_code)]
    Curve {
        x1: f64,
        y1: f64,
        cp1x: f64,
        cp1y: f64,
        cp2x: f64,
        cp2y: f64,
        x2: f64,
        y2: f64,
    },
}
1254
1255fn try_classify_rectangle(
1257 segments: &[PathSegment],
1258 _line_width: f64,
1259 page_number: u32,
1260) -> Option<LineChunk> {
1261 let mut min_x = f64::MAX;
1262 let mut min_y = f64::MAX;
1263 let mut max_x = f64::MIN;
1264 let mut max_y = f64::MIN;
1265
1266 for seg in segments {
1267 if let PathSegment::Line { x1, y1, x2, y2 } = seg {
1268 min_x = min_x.min(*x1).min(*x2);
1269 min_y = min_y.min(*y1).min(*y2);
1270 max_x = max_x.max(*x1).max(*x2);
1271 max_y = max_y.max(*y1).max(*y2);
1272 } else {
1273 return None;
1274 }
1275 }
1276
1277 let w = max_x - min_x;
1278 let h = max_y - min_y;
1279
1280 if w < MIN_LINE_WIDTH || h < MIN_LINE_WIDTH {
1281 return None;
1282 }
1283
1284 let is_square = (w - h).abs() / w.max(h) < 0.3;
1285
1286 Some(LineChunk {
1287 bbox: BoundingBox::new(Some(page_number), min_x, min_y, max_x, max_y),
1288 index: None,
1289 level: None,
1290 start: Vertex {
1291 x: min_x,
1292 y: min_y,
1293 radius: 0.0,
1294 },
1295 end: Vertex {
1296 x: max_x,
1297 y: max_y,
1298 radius: 0.0,
1299 },
1300 width: w.min(h),
1301 is_horizontal_line: w > h * LINE_ASPECT_RATIO,
1302 is_vertical_line: h > w * LINE_ASPECT_RATIO,
1303 is_square,
1304 })
1305}
1306
1307fn get_form_matrix(doc: &Document, dict: &Dictionary) -> Matrix {
1309 match dict.get(b"Matrix") {
1310 Ok(obj) => {
1311 let resolved = resolve_obj(doc, obj);
1312 if let Ok(arr) = resolved.as_array() {
1313 let vals: Vec<f64> = arr.iter().filter_map(|o| obj_to_f64(o.clone())).collect();
1314 if vals.len() == 6 {
1315 return Matrix {
1316 a: vals[0],
1317 b: vals[1],
1318 c: vals[2],
1319 d: vals[3],
1320 e: vals[4],
1321 f: vals[5],
1322 };
1323 }
1324 }
1325 Matrix::identity()
1326 }
1327 Err(_) => Matrix::identity(),
1328 }
1329}
1330
1331fn resolve_form_fonts(doc: &Document, resources: &Dictionary) -> FontCache {
1333 let font_dict = match resources.get(b"Font") {
1334 Ok(obj) => {
1335 let resolved = resolve_obj(doc, obj);
1336 match resolved.as_dict() {
1337 Ok(d) => d.clone(),
1338 Err(_) => return FontCache::default(),
1339 }
1340 }
1341 Err(_) => return FontCache::default(),
1342 };
1343
1344 let mut cache = FontCache::default();
1345 for (name_bytes, font_ref) in font_dict.iter() {
1346 let font_name = String::from_utf8_lossy(name_bytes).to_string();
1347 let font_obj = resolve_obj(doc, font_ref);
1348 if let Ok(font_dict) = font_obj.as_dict() {
1349 let font = super::font::resolve_font_dict(doc, &font_name, font_dict);
1350 cache.insert(font_name, font);
1351 }
1352 }
1353 cache
1354}
1355
1356fn extract_string_bytes(obj: &Object) -> Option<Vec<u8>> {
1359 match obj {
1360 Object::String(bytes, _) => Some(bytes.clone()),
1361 _ => None,
1362 }
1363}
1364
1365fn extract_mcid_from_bdc(operands: &[Object]) -> Option<i64> {
1366 if operands.len() < 2 {
1367 return None;
1368 }
1369 match &operands[1] {
1370 Object::Dictionary(dict) => {
1371 if let Ok(Object::Integer(n)) = dict.get(b"MCID") {
1372 return Some(*n);
1373 }
1374 None
1375 }
1376 _ => None,
1377 }
1378}
1379
1380fn obj_to_f64(obj: Object) -> Option<f64> {
1381 match obj {
1382 Object::Integer(i) => Some(i as f64),
1383 Object::Real(f) => Some(f),
1384 _ => None,
1385 }
1386}
1387
1388fn obj_to_name(obj: &Object) -> String {
1389 match obj {
1390 Object::Name(bytes) => String::from_utf8_lossy(bytes).to_string(),
1391 _ => String::new(),
1392 }
1393}
1394
1395fn obj_name_bytes(obj: &Object) -> Option<Vec<u8>> {
1397 match obj {
1398 Object::Name(bytes) => Some(bytes.clone()),
1399 _ => None,
1400 }
1401}
1402
/// Number of colour operands carried by the given colour-space name.
///
/// Covers the device/CIE families plus the inline-image abbreviations
/// (`G`, `RGB`, `CMYK`, `I`) defined by the PDF specification. Indexed
/// and Separation spaces take a single operand (an index / a tint
/// value), and Lab takes three. Unknown spaces default to 3 (RGB), the
/// most common case.
fn color_space_components(name: &str) -> u8 {
    match name {
        // One operand: grayscale value, palette index, or tint.
        "DeviceGray" | "CalGray" | "G" | "Indexed" | "I" | "Separation" => 1,
        "DeviceRGB" | "CalRGB" | "Lab" | "RGB" => 3,
        "DeviceCMYK" | "CMYK" => 4,
        _ => 3,
    }
}
1411
/// Initial (black) colour value for a space with the given component
/// count: CMYK black is expressed as full K, everything else as zeros.
fn default_color_for_space(components: u8) -> Vec<f64> {
    if components == 4 {
        // CMYK: black is K = 1.0, not all-zero (which would be white-ish).
        vec![0.0, 0.0, 0.0, 1.0]
    } else if components == 3 {
        vec![0.0; 3]
    } else {
        vec![0.0]
    }
}
1420
1421fn resolve_obj(doc: &Document, obj: &Object) -> Object {
1422 match obj {
1423 Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(Object::Null),
1424 other => other.clone(),
1425 }
1426}
1427
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::content::Operation;
    use lopdf::{dictionary, Stream};

    /// Build a minimal single-page PDF whose content stream draws
    /// "Hello World!" at (100, 700) in 12pt Helvetica (resource "F1").
    fn create_test_pdf_with_text() -> Document {
        let mut doc = Document::with_version("1.5");
        // Reserve the Pages id up front so the page can reference its parent.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // BT..ET text object: select F1 at 12pt, move to (100, 700), show text.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![100.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello World!")]),
                Operation::new("ET", vec![]),
            ],
        };

        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Fill in the Pages node at the id reserved above.
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);
        doc
    }

    /// Text in a plain content stream comes back as text chunks,
    /// and no spurious image chunks are produced.
    #[test]
    fn test_unified_text_extraction() {
        let doc = create_test_pdf_with_text();
        let pages = doc.get_pages();
        let (&page_num, &page_id) = pages.iter().next().unwrap();

        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
        assert!(!chunks.text_chunks.is_empty(), "Expected text chunks");
        assert!(
            chunks.text_chunks[0].value.contains("Hello"),
            "Expected 'Hello' in text"
        );
        assert!(chunks.image_chunks.is_empty(), "No images expected");
    }

    /// An Image XObject painted with `Do` yields one image chunk whose
    /// bounding box reflects the current transformation matrix.
    #[test]
    fn test_image_from_do_operator() {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let img_stream = Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Image",
                "Width" => 200,
                "Height" => 100,
                "ColorSpace" => "DeviceRGB",
                "BitsPerComponent" => 8,
            },
            vec![0u8; 100],
        );
        let img_id = doc.add_object(img_stream);

        let resources_id = doc.add_object(dictionary! {
            "XObject" => dictionary! {
                "Im1" => img_id,
            },
        });

        // The CTM [200 0 0 100 72 500] maps the 1x1 image unit square to a
        // 200x100 rectangle whose lower-left corner is at (72, 500), so the
        // expected bbox is x in [72, 272], y in [500, 600].
        let content = Content {
            operations: vec![
                Operation::new("q", vec![]),
                Operation::new(
                    "cm",
                    vec![
                        Object::Real(200.0),
                        0.into(),
                        0.into(),
                        Object::Real(100.0),
                        Object::Real(72.0),
                        Object::Real(500.0),
                    ],
                ),
                Operation::new("Do", vec!["Im1".into()]),
                Operation::new("Q", vec![]),
            ],
        };

        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages = doc.get_pages();
        let (&page_num, &page_id) = pages.iter().next().unwrap();

        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
        assert_eq!(chunks.image_chunks.len(), 1, "Expected 1 image chunk");

        let img = &chunks.image_chunks[0];
        assert!(
            (img.bbox.left_x - 72.0).abs() < 1.0,
            "Expected left_x ~72, got {}",
            img.bbox.left_x
        );
        assert!(
            (img.bbox.bottom_y - 500.0).abs() < 1.0,
            "Expected bottom_y ~500, got {}",
            img.bbox.bottom_y
        );
        assert!(
            (img.bbox.right_x - 272.0).abs() < 1.0,
            "Expected right_x ~272, got {}",
            img.bbox.right_x
        );
        assert!(
            (img.bbox.top_y - 600.0).abs() < 1.0,
            "Expected top_y ~600, got {}",
            img.bbox.top_y
        );
    }

    /// Text drawn inside a Form XObject (invoked via `Do`) is extracted
    /// recursively using the form's own font resources.
    #[test]
    fn test_form_xobject_recursive() {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        // The form's content stream shows "Form Text" at its local origin.
        let form_content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 10.into()]),
                Operation::new("Td", vec![0.into(), 0.into()]),
                Operation::new("Tj", vec![Object::string_literal("Form Text")]),
                Operation::new("ET", vec![]),
            ],
        };

        // The form carries its own /Resources so the parser must resolve
        // F1 from the form, not from the page.
        let form_stream = Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Form",
                "BBox" => vec![0.into(), 0.into(), 200.into(), 50.into()],
                "Resources" => dictionary! {
                    "Font" => dictionary! {
                        "F1" => font_id,
                    },
                },
            },
            form_content.encode().unwrap(),
        );
        let form_id = doc.add_object(form_stream);

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
            "XObject" => dictionary! {
                "Fm1" => form_id,
            },
        });

        // The page translates by (50, 400) before painting the form.
        let page_content = Content {
            operations: vec![
                Operation::new("q", vec![]),
                Operation::new(
                    "cm",
                    vec![
                        1.into(),
                        0.into(),
                        0.into(),
                        1.into(),
                        Object::Real(50.0),
                        Object::Real(400.0),
                    ],
                ),
                Operation::new("Do", vec!["Fm1".into()]),
                Operation::new("Q", vec![]),
            ],
        };

        let content_id =
            doc.add_object(Stream::new(dictionary! {}, page_content.encode().unwrap()));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages = doc.get_pages();
        let (&page_num, &page_id) = pages.iter().next().unwrap();

        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
        assert!(
            !chunks.text_chunks.is_empty(),
            "Expected text from Form XObject"
        );
        assert!(
            chunks.text_chunks[0].value.contains("Form"),
            "Expected 'Form' text, got: '{}'",
            chunks.text_chunks[0].value
        );
    }

    /// A stroked horizontal path (m/l/S) is classified as a horizontal
    /// line chunk.
    #[test]
    fn test_line_extraction_unified() {
        let mut doc = Document::with_version("1.5");
        let pages_id = doc.new_object_id();

        // Stroke a 1pt-wide path from (72, 400) to (500, 400).
        let content = Content {
            operations: vec![
                Operation::new("w", vec![Object::Real(1.0)]),
                Operation::new("m", vec![72.into(), 400.into()]),
                Operation::new("l", vec![500.into(), 400.into()]),
                Operation::new("S", vec![]),
            ],
        };

        let content_id = doc.add_object(Stream::new(dictionary! {}, content.encode().unwrap()));

        // Note: no /Resources on this page — line extraction must not need them.
        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let pages = doc.get_pages();
        let (&page_num, &page_id) = pages.iter().next().unwrap();

        let chunks = extract_page_chunks(&doc, page_num, page_id).unwrap();
        assert_eq!(chunks.line_chunks.len(), 1, "Expected 1 horizontal line");
        assert!(chunks.line_chunks[0].is_horizontal_line);
    }
}