1use crate::ast::document::XRefEntry;
2use crate::ast::{EdgeType, NodeId, NodeType, PdfAstGraph, PdfDocument};
3use crate::filters::decode_stream_with_limits;
4use crate::parser::{content_operands, content_stream, object_parser};
5use crate::performance::PerformanceLimits;
6use crate::types::{ObjectId, PdfDictionary, PdfReference, PdfValue, StreamData};
7use log::{debug, info, warn};
8use nom::IResult;
9use std::collections::{HashMap, HashSet, VecDeque};
10use std::io::{BufRead, Seek, SeekFrom};
11
12pub struct ObjectNodeMap {
14 object_to_node: HashMap<ObjectId, NodeId>,
15}
16
17impl Default for ObjectNodeMap {
18 fn default() -> Self {
19 Self::new()
20 }
21}
22
23impl ObjectNodeMap {
24 pub fn new() -> Self {
25 ObjectNodeMap {
26 object_to_node: HashMap::new(),
27 }
28 }
29
30 pub fn insert(&mut self, obj_id: ObjectId, node_id: NodeId) {
31 self.object_to_node.insert(obj_id, node_id);
32 }
33
34 pub fn get_node_id(&self, obj_id: &ObjectId) -> Option<NodeId> {
35 self.object_to_node.get(obj_id).copied()
36 }
37
38 pub fn get_object_node_map(&self) -> ObjectNodeMap {
39 ObjectNodeMap::from_map(self.object_to_node.clone())
40 }
41
42 pub fn from_map(map: HashMap<ObjectId, NodeId>) -> Self {
43 ObjectNodeMap {
44 object_to_node: map,
45 }
46 }
47}
48
/// Resolves indirect references found in AST nodes by loading the referenced
/// objects from a seekable PDF source and linking them into the graph.
pub struct ReferenceResolver<R: BufRead + Seek> {
    /// Source the objects are read from.
    reader: R,
    /// Byte offset of each uncompressed object, keyed by object id.
    xref_table: HashMap<ObjectId, u64>,
    /// For objects stored inside an object stream:
    /// `(object number of the containing stream, index within that stream)`.
    compressed_objects: HashMap<ObjectId, (u32, u32)>,
    /// AST node created for each object loaded so far.
    object_to_node: HashMap<ObjectId, NodeId>,
    /// Objects already handled by `resolve_references`, so they are not loaded again.
    resolved_objects: HashSet<ObjectId>,
    /// Queue of `(referencing node, reference)` pairs still to be linked.
    pending_references: VecDeque<(NodeId, PdfReference)>,
    /// When `true`, unparsable or missing objects become placeholder nodes
    /// instead of errors.
    tolerant: bool,
    /// Size and decode limits applied when reading objects and streams.
    limits: PerformanceLimits,
}
60
61impl<R: BufRead + Seek> ReferenceResolver<R> {
62 pub fn new(mut reader: R, tolerant: bool, limits: PerformanceLimits) -> Result<Self, String> {
63 let xref_table = Self::build_xref_table(&mut reader)?;
64
65 Ok(Self {
66 reader,
67 xref_table,
68 compressed_objects: HashMap::new(),
69 object_to_node: HashMap::new(),
70 resolved_objects: HashSet::new(),
71 pending_references: VecDeque::new(),
72 tolerant,
73 limits,
74 })
75 }
76
77 pub fn from_document(
79 reader: R,
80 document: &PdfDocument,
81 tolerant: bool,
82 limits: PerformanceLimits,
83 ) -> Self {
84 let mut xref_table = HashMap::new();
85 let mut compressed_objects = HashMap::new();
86
87 for (obj_id, entry) in &document.xref.entries {
89 match entry {
90 XRefEntry::InUse { offset, .. } => {
91 xref_table.insert(*obj_id, *offset);
92 }
93 XRefEntry::Compressed {
94 stream_object,
95 index,
96 } => {
97 compressed_objects.insert(*obj_id, (*stream_object, *index));
98 debug!(
100 "Object {:?} is compressed in stream {:?} at index {}",
101 obj_id, stream_object, index
102 );
103 }
104 _ => {}
105 }
106 }
107
108 info!("Converted {} xref entries from document", xref_table.len());
109
110 Self {
111 reader,
112 xref_table,
113 compressed_objects,
114 object_to_node: HashMap::new(),
115 resolved_objects: HashSet::new(),
116 pending_references: VecDeque::new(),
117 tolerant,
118 limits,
119 }
120 }
121
122 fn build_xref_table(reader: &mut R) -> Result<HashMap<ObjectId, u64>, String> {
124 reader
126 .seek(SeekFrom::End(-1024))
127 .map_err(|e| format!("Seek error: {}", e))?;
128
129 let mut buffer = Vec::new();
130 reader
131 .read_to_end(&mut buffer)
132 .map_err(|e| format!("Read error: {}", e))?;
133
134 let content = String::from_utf8_lossy(&buffer);
135
136 if let Some(startxref_pos) = content.rfind("startxref") {
137 let xref_section = &content[startxref_pos..];
138 if let Some(offset_str) = xref_section.lines().nth(1) {
139 if let Ok(xref_offset) = offset_str.trim().parse::<u64>() {
140 return Self::parse_xref_table(reader, xref_offset);
141 }
142 }
143 }
144
145 Self::scan_for_objects(reader)
147 }
148
149 fn parse_xref_table(reader: &mut R, offset: u64) -> Result<HashMap<ObjectId, u64>, String> {
151 reader
152 .seek(SeekFrom::Start(offset))
153 .map_err(|e| format!("Seek error: {}", e))?;
154
155 let mut buffer = Vec::new();
156 reader
157 .read_to_end(&mut buffer)
158 .map_err(|e| format!("Read error: {}", e))?;
159
160 if buffer.starts_with(b"<<") || buffer.iter().take(20).any(|&b| b.is_ascii_digit()) {
162 if let Ok((_, (_obj_id, PdfValue::Stream(stream)))) =
164 object_parser::parse_indirect_object(&buffer)
165 {
166 return crate::parser::xref::parse_xref_stream(&stream).map(|entries| {
167 entries
168 .into_iter()
169 .filter_map(|(id, entry)| {
170 if let XRefEntry::InUse { offset, .. } = entry {
171 Some((id, offset))
172 } else {
173 None
174 }
175 })
176 .collect()
177 });
178 }
179 }
180
181 let mut xref_table = HashMap::new();
183 let content = String::from_utf8_lossy(&buffer);
184
185 if content.starts_with("xref") {
186 let mut lines = content.lines().skip(1); while let Some(line) = lines.next() {
189 let line = line.trim();
190 if line.is_empty() || line.starts_with("trailer") {
191 break;
192 }
193
194 let parts: Vec<&str> = line.split_whitespace().collect();
196 if parts.len() == 2 {
197 if let (Ok(start), Ok(count)) =
198 (parts[0].parse::<u32>(), parts[1].parse::<u32>())
199 {
200 for i in 0..count {
201 if let Some(entry_line) = lines.next() {
202 let entry_parts: Vec<&str> =
203 entry_line.split_whitespace().collect();
204 if entry_parts.len() >= 3 && entry_parts[2] == "n" {
205 if let (Ok(offset), Ok(gen)) = (
206 entry_parts[0].parse::<u64>(),
207 entry_parts[1].parse::<u16>(),
208 ) {
209 let obj_id = ObjectId::new(start + i, gen);
210 xref_table.insert(obj_id, offset);
211 }
212 }
213 }
214 }
215 }
216 }
217 }
218 }
219
220 Ok(xref_table)
221 }
222
223 fn scan_for_objects(reader: &mut R) -> Result<HashMap<ObjectId, u64>, String> {
225 reader
226 .seek(SeekFrom::Start(0))
227 .map_err(|e| format!("Seek error: {}", e))?;
228
229 let mut content = Vec::new();
230 reader
231 .read_to_end(&mut content)
232 .map_err(|e| format!("Read error: {}", e))?;
233
234 let mut xref_table = HashMap::new();
235 let mut pos = 0;
236
237 while pos < content.len() {
239 if let Some(obj_pos) = Self::find_next_object(&content[pos..]) {
240 let absolute_pos = pos + obj_pos;
241
242 if let Ok((_, obj_id)) = Self::parse_object_header(&content[absolute_pos..]) {
244 xref_table.insert(obj_id, absolute_pos as u64);
245 }
246
247 pos = absolute_pos + 1;
248 } else {
249 break;
250 }
251 }
252
253 info!("Found {} objects by scanning", xref_table.len());
254 Ok(xref_table)
255 }
256
257 fn find_next_object(data: &[u8]) -> Option<usize> {
258 for i in 0..data.len().saturating_sub(10) {
259 if data[i].is_ascii_digit() {
261 let mut j = i;
262 while j < data.len() && data[j].is_ascii_digit() {
263 j += 1;
264 }
265 if j < data.len() && data[j].is_ascii_whitespace() {
266 j += 1;
267 while j < data.len() && data[j].is_ascii_whitespace() {
268 j += 1;
269 }
270 if j < data.len() && data[j].is_ascii_digit() {
271 let _k = j;
272 while j < data.len() && data[j].is_ascii_digit() {
273 j += 1;
274 }
275 if j + 4 <= data.len() && &data[j..j + 4] == b" obj" {
276 return Some(i);
277 }
278 }
279 }
280 }
281 }
282 None
283 }
284
285 fn parse_object_header(data: &[u8]) -> IResult<&[u8], ObjectId> {
286 use nom::{
287 bytes::complete::tag,
288 character::complete::{digit1, space1},
289 combinator::map,
290 sequence::tuple,
291 };
292
293 map(
294 tuple((digit1, space1, digit1, space1, tag(b"obj"))),
295 |(num, _, gen, _, _)| {
296 let num = std::str::from_utf8(num).unwrap_or("0").parse().unwrap_or(0);
297 let gen = std::str::from_utf8(gen).unwrap_or("0").parse().unwrap_or(0);
298 ObjectId::new(num, gen)
299 },
300 )(data)
301 }
302
303 pub fn resolve_references(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
305 let nodes = ast.get_all_nodes();
307 for node in &nodes {
308 self.collect_references_from_node(node.id, &node.value);
309 }
310
311 while let Some((source_node, pdf_ref)) = self.pending_references.pop_front() {
313 let obj_id = pdf_ref.id();
314
315 let target_node = if let Some(&existing_node) = self.object_to_node.get(&obj_id) {
317 existing_node
318 } else if !self.resolved_objects.contains(&obj_id) {
319 match self.resolve_object(obj_id, ast) {
321 Ok(node_id) => {
322 self.resolved_objects.insert(obj_id);
323 self.object_to_node.insert(obj_id, node_id);
324 node_id
325 }
326 Err(e) => {
327 warn!("Failed to resolve reference {}: {}", obj_id, e);
328 continue;
329 }
330 }
331 } else {
332 continue; };
334
335 ast.add_edge(source_node, target_node, EdgeType::Reference);
337 debug!(
338 "Created reference edge from {:?} to {:?} for object {}",
339 source_node, target_node, obj_id
340 );
341 }
342
343 self.resolve_stream_lengths(ast)?;
345
346 self.build_page_resources(ast)?;
348
349 self.build_font_resources(ast)?;
351
352 self.build_content_stream_ast(ast)?;
354
355 self.build_javascript_nodes(ast)?;
357
358 Ok(())
359 }
360
361 fn build_page_resources(&self, ast: &mut PdfAstGraph) -> Result<(), String> {
362 use crate::parser::colorspace::ColorSpaceParser;
363
364 let resolver_map = ObjectNodeMap::from_map(self.object_to_node.clone());
365 let node_ids: Vec<NodeId> = ast.get_all_nodes().iter().map(|n| n.id).collect();
366 for node_id in node_ids {
367 let node = match ast.get_node(node_id) {
368 Some(node) => node,
369 None => continue,
370 };
371 if node.node_type != NodeType::Page {
372 continue;
373 }
374
375 let page_dict = match node.as_dict() {
376 Some(dict) => dict.clone(),
377 None => continue,
378 };
379
380 let resources = match page_dict.get("Resources") {
381 Some(PdfValue::Dictionary(dict)) => Some(dict.clone()),
382 Some(PdfValue::Reference(res_ref)) => self
383 .object_to_node
384 .get(&res_ref.id())
385 .and_then(|res_id| ast.get_node(*res_id))
386 .and_then(|res_node| res_node.as_dict().cloned()),
387 _ => None,
388 };
389
390 let resources = match resources {
391 Some(res) => res,
392 None => continue,
393 };
394
395 if let Some(PdfValue::Dictionary(colorspaces)) = resources.get("ColorSpace") {
396 for (cs_name, cs_value) in colorspaces.iter() {
397 let mut parser = ColorSpaceParser::new(ast, &resolver_map);
398 if let Some(cs_id) = parser.parse_colorspace(cs_value) {
399 ast.add_edge(node_id, cs_id, EdgeType::Resource);
400 if let Some(cs_node) = ast.get_node_mut(cs_id) {
401 cs_node
402 .metadata
403 .set_property("resource_name".to_string(), cs_name.to_string());
404 }
405 }
406 }
407 }
408 }
409
410 Ok(())
411 }
412
413 fn collect_references_from_node(&mut self, node_id: NodeId, value: &PdfValue) {
414 let mut stack = vec![value];
415 while let Some(current) = stack.pop() {
416 match current {
417 PdfValue::Reference(pdf_ref) => {
418 self.pending_references.push_back((node_id, *pdf_ref));
419 }
420 PdfValue::Array(array) => {
421 for item in array.iter() {
422 stack.push(item);
423 }
424 }
425 PdfValue::Dictionary(dict) => {
426 for (_, val) in dict.iter() {
427 stack.push(val);
428 }
429 }
430 PdfValue::Stream(stream) => {
431 for (_, val) in stream.dict.iter() {
432 stack.push(val);
433 }
434 }
435 _ => {}
436 }
437 }
438 }
439
440 fn resolve_object(
442 &mut self,
443 obj_id: ObjectId,
444 ast: &mut PdfAstGraph,
445 ) -> Result<NodeId, String> {
446 if let Some(&offset) = self.xref_table.get(&obj_id) {
447 self.reader
449 .seek(SeekFrom::Start(offset))
450 .map_err(|e| format!("Seek error: {}", e))?;
451
452 let mut buffer = Vec::new();
453 let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
454 let mut total_read = 0usize;
455 let mut chunk = vec![0u8; 65536];
456 let mut found_endobj = false;
457
458 while total_read < max_bytes {
459 let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
460 let bytes_read = self
461 .reader
462 .read(&mut chunk[..to_read])
463 .map_err(|e| format!("Read error: {}", e))?;
464 if bytes_read == 0 {
465 break;
466 }
467 buffer.extend_from_slice(&chunk[..bytes_read]);
468 total_read += bytes_read;
469
470 if buffer.windows(6).any(|w| w == b"endobj") {
471 found_endobj = true;
472 break;
473 }
474 }
475
476 if !found_endobj && total_read >= max_bytes && !self.tolerant {
477 return Err(format!(
478 "Object {} exceeds max size {}MB",
479 obj_id.number, self.limits.max_object_size_mb
480 ));
481 }
482
483 match object_parser::parse_indirect_object(&buffer) {
485 Ok((rest, (parsed_obj_id, value))) => {
486 if parsed_obj_id != obj_id {
487 warn!(
488 "Object ID mismatch: expected {:?}, got {:?}",
489 obj_id, parsed_obj_id
490 );
491 }
492
493 let node_type = self.determine_node_type(&value, obj_id);
495 let node_id = ast.create_node(node_type, value);
496
497 if let Some(node) = ast.get_node_mut(node_id) {
499 node.metadata.offset = Some(offset);
500 node.metadata.size = Some(buffer.len() - rest.len());
501 node.metadata.properties.insert(
502 "object_id".to_string(),
503 format!("{} {} R", obj_id.number, obj_id.generation),
504 );
505 if let PdfValue::Stream(stream) = &node.value {
506 node.metadata
507 .properties
508 .insert("stream_length".to_string(), stream.data.len().to_string());
509 node.metadata.properties.insert(
510 "stream_filters".to_string(),
511 stream
512 .get_filters()
513 .iter()
514 .map(|f| f.name())
515 .collect::<Vec<_>>()
516 .join(","),
517 );
518 }
519 }
520
521 Ok(node_id)
522 }
523 Err(e) => {
524 if self.tolerant {
525 if let Some(recovered) = self.parse_object_value_fallback(&buffer) {
526 let node_type = self.determine_node_type(&recovered, obj_id);
527 let node_id = ast.create_node(node_type, recovered);
528 if let Some(node) = ast.get_node_mut(node_id) {
529 node.metadata.offset = Some(offset);
530 node.metadata.size = Some(buffer.len());
531 node.metadata.warnings.push(
532 "Recovered object by parsing value after obj keyword"
533 .to_string(),
534 );
535 node.metadata.properties.insert(
536 "recovery".to_string(),
537 "parse_value_after_obj".to_string(),
538 );
539 }
540 return Ok(node_id);
541 }
542
543 let node_id = ast.create_node(NodeType::Object(obj_id), PdfValue::Null);
544 if let Some(node) = ast.get_node_mut(node_id) {
545 node.metadata.offset = Some(offset);
546 node.metadata.size = Some(buffer.len());
547 node.metadata.errors.push(crate::ast::node::ParseError {
548 code: crate::ast::node::ErrorCode::InvalidSyntax,
549 message: format!("Failed to parse object: {:?}", e),
550 offset: Some(offset),
551 recoverable: true,
552 });
553 node.metadata
554 .warnings
555 .push("Recovered from parse error".to_string());
556 node.metadata.properties.insert(
557 "recovery".to_string(),
558 "parse_indirect_object_failed".to_string(),
559 );
560 }
561 Ok(node_id)
562 } else {
563 Err(format!(
564 "Failed to parse object at offset {}: {:?}",
565 offset, e
566 ))
567 }
568 }
569 }
570 } else if let Some(&(stream_object, index)) = self.compressed_objects.get(&obj_id) {
571 let (value, meta) = self
572 .resolve_compressed_object(stream_object, index)
573 .map_err(|e| format!("Compressed object {} error: {}", obj_id.number, e))?;
574 let node_type = self.determine_node_type(&value, obj_id);
575 let node_id = ast.create_node(node_type, value);
576
577 if let Some(node) = ast.get_node_mut(node_id) {
578 node.metadata.offset = meta.file_offset;
579 node.metadata.size = meta.object_length;
580 node.metadata.properties.insert(
581 "object_id".to_string(),
582 format!("{} {} R", obj_id.number, obj_id.generation),
583 );
584 node.metadata.properties.insert(
585 "container_stream".to_string(),
586 format!("{} 0 R", stream_object),
587 );
588 if let Some(offset) = meta.container_offset {
589 node.metadata
590 .properties
591 .insert("container_stream_offset".to_string(), offset.to_string());
592 }
593 if let Some(stream_offset) = meta.object_offset {
594 node.metadata.properties.insert(
595 "object_stream_offset".to_string(),
596 stream_offset.to_string(),
597 );
598 }
599 if let Some(stream_length) = meta.object_length {
600 node.metadata.properties.insert(
601 "object_stream_length".to_string(),
602 stream_length.to_string(),
603 );
604 }
605 node.metadata
606 .properties
607 .insert("object_stream_index".to_string(), index.to_string());
608 }
609
610 Ok(node_id)
611 } else if self.tolerant {
612 let node_id = ast.create_node(NodeType::Object(obj_id), PdfValue::Null);
613 if let Some(node) = ast.get_node_mut(node_id) {
614 node.metadata.errors.push(crate::ast::node::ParseError {
615 code: crate::ast::node::ErrorCode::MissingObject,
616 message: "Object not found in xref table".to_string(),
617 offset: None,
618 recoverable: true,
619 });
620 node.metadata
621 .warnings
622 .push("Recovered missing object reference".to_string());
623 node.metadata
624 .properties
625 .insert("recovery".to_string(), "xref_missing_object".to_string());
626 }
627 Ok(node_id)
628 } else {
629 Err(format!("Object {} not found in xref table", obj_id))
630 }
631 }
632
633 fn parse_object_value_fallback(&self, buffer: &[u8]) -> Option<PdfValue> {
634 let obj_pos = buffer.windows(3).position(|w| w == b"obj")?;
635 let mut pos = obj_pos + 3;
636 while pos < buffer.len() && buffer[pos].is_ascii_whitespace() {
637 pos += 1;
638 }
639 object_parser::parse_value(&buffer[pos..])
640 .ok()
641 .map(|(_, value)| value)
642 }
643
644 fn resolve_compressed_object(
645 &mut self,
646 stream_object: u32,
647 index: u32,
648 ) -> Result<(PdfValue, CompressedObjectMeta), String> {
649 let stream_id = ObjectId::new(stream_object, 0);
650 let stream_offset = self.xref_table.get(&stream_id).copied();
651 let (stream, dict) = self.load_object_stream(stream_object)?;
652 let (value, object_offset, object_length) =
653 self.parse_object_stream_entry(&stream, &dict, index)?;
654
655 Ok((
656 value,
657 CompressedObjectMeta {
658 file_offset: stream_offset,
659 container_offset: stream_offset,
660 object_offset: Some(object_offset as u64),
661 object_length: Some(object_length),
662 },
663 ))
664 }
665
666 fn load_object_stream(
667 &mut self,
668 stream_object: u32,
669 ) -> Result<(Vec<u8>, PdfDictionary), String> {
670 let stream_id = ObjectId::new(stream_object, 0);
671 let offset = self
672 .xref_table
673 .get(&stream_id)
674 .copied()
675 .ok_or_else(|| format!("Object stream {} offset missing", stream_object))?;
676
677 self.reader
678 .seek(SeekFrom::Start(offset))
679 .map_err(|e| format!("Seek error: {}", e))?;
680
681 let mut buffer = Vec::new();
682 let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
683 let mut total_read = 0usize;
684 let mut chunk = vec![0u8; 65536];
685 let mut found_endobj = false;
686
687 while total_read < max_bytes {
688 let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
689 let bytes_read = self
690 .reader
691 .read(&mut chunk[..to_read])
692 .map_err(|e| format!("Read error: {}", e))?;
693 if bytes_read == 0 {
694 break;
695 }
696 buffer.extend_from_slice(&chunk[..bytes_read]);
697 total_read += bytes_read;
698
699 if buffer.windows(6).any(|w| w == b"endobj") {
700 found_endobj = true;
701 break;
702 }
703 }
704
705 if !found_endobj && !self.tolerant {
706 return Err("Object stream missing endobj".to_string());
707 }
708
709 let (_, (_obj_id, value)) = object_parser::parse_indirect_object(&buffer)
710 .map_err(|e| format!("Failed to parse object stream: {:?}", e))?;
711 let stream = match value {
712 PdfValue::Stream(stream) => stream,
713 _ => return Err("Object stream is not a stream".to_string()),
714 };
715
716 let filters = stream.get_filters();
717 let raw = stream
718 .raw_data()
719 .ok_or_else(|| "Object stream has no data".to_string())?;
720
721 let decoded = decode_stream_with_limits(
722 raw,
723 &filters,
724 self.limits.max_object_size_mb * 1024 * 1024,
725 self.limits.max_stream_decode_ratio,
726 )
727 .map_err(|e| format!("Failed to decode object stream: {}", e))?;
728
729 Ok((decoded, stream.dict))
730 }
731
732 fn parse_object_stream_entry(
733 &self,
734 data: &[u8],
735 dict: &PdfDictionary,
736 index: u32,
737 ) -> Result<(PdfValue, usize, usize), String> {
738 let n = dict.get("N").and_then(|v| v.as_integer()).unwrap_or(0) as usize;
739 let first = dict.get("First").and_then(|v| v.as_integer()).unwrap_or(0) as usize;
740
741 if n == 0 || first == 0 || first > data.len() {
742 return Err("Invalid object stream header".to_string());
743 }
744 if index as usize >= n {
745 return Err("Object stream index out of range".to_string());
746 }
747
748 let header = &data[..first];
749 let mut pos = 0usize;
750 let mut offsets = Vec::with_capacity(n);
751
752 for _ in 0..n {
753 while pos < header.len() && header[pos].is_ascii_whitespace() {
754 pos += 1;
755 }
756 let num_start = pos;
757 while pos < header.len() && header[pos].is_ascii_digit() {
758 pos += 1;
759 }
760 let _obj_num = std::str::from_utf8(&header[num_start..pos])
761 .unwrap_or("0")
762 .parse::<u32>()
763 .unwrap_or(0);
764 while pos < header.len() && header[pos].is_ascii_whitespace() {
765 pos += 1;
766 }
767 let off_start = pos;
768 while pos < header.len() && header[pos].is_ascii_digit() {
769 pos += 1;
770 }
771 let obj_offset = std::str::from_utf8(&header[off_start..pos])
772 .unwrap_or("0")
773 .parse::<usize>()
774 .unwrap_or(0);
775 offsets.push(obj_offset);
776 }
777
778 if offsets.len() <= index as usize {
779 return Err("Object stream header incomplete".to_string());
780 }
781
782 let start = first + offsets[index as usize];
783 let mut next_offset = data.len();
784 for off in offsets.iter().skip(index as usize + 1) {
785 let candidate = first + *off;
786 if candidate > start && candidate < next_offset {
787 next_offset = candidate;
788 }
789 }
790
791 if start >= data.len() || start >= next_offset {
792 return Err("Invalid object stream offsets".to_string());
793 }
794
795 let slice = &data[start..next_offset];
796 let (_, value) =
797 object_parser::parse_value(slice).map_err(|e| format!("Parse value error: {:?}", e))?;
798 Ok((value, start, next_offset - start))
799 }
800
801 fn determine_node_type(&self, value: &PdfValue, obj_id: ObjectId) -> NodeType {
802 if let PdfValue::Dictionary(dict) = value {
803 if let Some(PdfValue::Name(type_name)) = dict.get("Type") {
804 match type_name.as_str() {
805 "/Catalog" => return NodeType::Catalog,
806 "/Pages" => return NodeType::Pages,
807 "/Page" => return NodeType::Page,
808 "/Font" => return NodeType::Font,
809 "/XObject" => {
810 if let Some(PdfValue::Name(subtype)) = dict.get("Subtype") {
811 if subtype.as_str() == "/Image" {
812 return NodeType::Image;
813 }
814 }
815 return NodeType::XObject;
816 }
817 "/Annot" => return NodeType::Annotation,
818 "/Metadata" => return NodeType::Metadata,
819 _ => {}
820 }
821 }
822 }
823
824 if let PdfValue::Stream(_) = value {
825 return NodeType::ContentStream;
826 }
827
828 NodeType::Object(obj_id)
829 }
830
831 fn resolve_stream_lengths(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
833 let nodes = ast.get_all_nodes();
834 let mut updates = Vec::new();
835
836 for node in nodes {
837 if let PdfValue::Stream(stream) = &node.value {
838 if let Some(PdfValue::Reference(length_ref)) = stream.dict.get("Length") {
839 let length_obj_id = length_ref.id();
841
842 if let Some(&offset) = self.xref_table.get(&length_obj_id) {
843 self.reader
844 .seek(SeekFrom::Start(offset))
845 .map_err(|e| format!("Seek error: {}", e))?;
846
847 let mut buffer = vec![0u8; 1024];
848 let bytes_read = self
849 .reader
850 .read(&mut buffer)
851 .map_err(|e| format!("Read error: {}", e))?;
852
853 if let Ok((_, (_, PdfValue::Integer(length)))) =
854 object_parser::parse_indirect_object(&buffer[..bytes_read])
855 {
856 updates.push((node.id, length as usize));
857 info!(
858 "Resolved indirect Length {} for stream in node {:?}",
859 length, node.id
860 );
861 }
862 }
863 }
864 }
865 }
866
867 for (node_id, length) in updates {
869 if let Some(node) = ast.get_node_mut(node_id) {
870 if let PdfValue::Stream(ref mut stream) = node.value {
871 stream
873 .dict
874 .insert("Length", PdfValue::Integer(length as i64));
875
876 if let StreamData::Raw(ref mut data) = stream.data {
878 if data.len() > length {
879 data.truncate(length);
880 debug!("Truncated stream data to resolved length {}", length);
881 }
882 }
883 }
884 }
885 }
886
887 Ok(())
888 }
889
890 fn build_content_stream_ast(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
892 let nodes = ast.get_all_nodes();
893 let mut content_streams = Vec::new();
894
895 for node in nodes {
897 if matches!(node.node_type, NodeType::ContentStream)
898 || (matches!(node.node_type, NodeType::Page) && node.as_dict().is_some())
899 {
900 content_streams.push(node.id);
901 }
902 }
903
904 for stream_node_id in content_streams {
906 if let Some(node) = ast.get_node(stream_node_id) {
907 let stream_data = if let PdfValue::Stream(stream) = &node.value {
908 let data = match &stream.data {
911 crate::types::stream::StreamData::Raw(data) => data,
912 crate::types::stream::StreamData::Decoded(data) => data,
913 _ => continue, };
915 let filters = stream.get_filters();
916 match decode_stream_with_limits(
917 data,
918 &filters,
919 self.limits.max_object_size_mb * 1024 * 1024,
920 self.limits.max_stream_decode_ratio,
921 ) {
922 Ok(decoded) => decoded,
923 Err(e) => {
924 warn!("Failed to decode stream: {}", e);
925 continue;
926 }
927 }
928 } else if let PdfValue::Dictionary(dict) = &node.value {
929 if let Some(PdfValue::Reference(_)) = dict.get("Contents") {
931 continue; }
933 continue;
934 } else {
935 continue;
936 };
937
938 let mut parser = content_stream::ContentStreamParser::new();
940 match parser.parse(&stream_data) {
941 Ok(operators) => {
942 let indexed =
943 content_operands::parse_content_stream_with_offsets(&stream_data);
944 if indexed.is_empty() {
945 for (i, op) in operators.iter().enumerate() {
947 let op_node_id = self.create_operator_node(ast, op, i);
948 ast.add_edge(stream_node_id, op_node_id, EdgeType::Child);
949 }
950 info!(
951 "Created {} operator nodes for stream {:?}",
952 operators.len(),
953 stream_node_id
954 );
955 } else {
956 for (i, item) in indexed.iter().enumerate() {
957 let op_node_id = self.create_operator_node(ast, &item.operator, i);
958 if let Some(node) = ast.get_node_mut(op_node_id) {
959 node.metadata.offset = Some(item.offset as u64);
960 node.metadata.properties.insert(
961 "stream_local_offset".to_string(),
962 item.offset.to_string(),
963 );
964 node.metadata.properties.insert(
965 "content_operator_index".to_string(),
966 i.to_string(),
967 );
968 }
969 ast.add_edge(stream_node_id, op_node_id, EdgeType::Child);
970 }
971 info!(
972 "Created {} operator nodes with offsets for stream {:?}",
973 indexed.len(),
974 stream_node_id
975 );
976 }
977 }
978 Err(e) => {
979 warn!("Failed to parse content stream: {:?}", e);
980 }
981 }
982 }
983 }
984
985 Ok(())
986 }
987
988 fn build_javascript_nodes(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
989 let node_ids: Vec<NodeId> = ast.get_all_nodes().iter().map(|n| n.id).collect();
990
991 for node_id in node_ids {
992 let dict = match ast.get_node(node_id).and_then(|node| node.as_dict()) {
993 Some(d) => d.clone(),
994 None => continue,
995 };
996
997 let js_value = dict.get("JS").or_else(|| dict.get("JavaScript"));
998 if js_value.is_none() {
999 continue;
1000 }
1001
1002 let existing_js = ast.get_children(node_id).into_iter().any(|child| {
1003 ast.get_node(child)
1004 .map(|n| n.node_type == NodeType::JavaScript)
1005 .unwrap_or(false)
1006 });
1007 if existing_js {
1008 continue;
1009 }
1010
1011 let resolved = match js_value.unwrap() {
1012 PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1013 value => value.clone(),
1014 };
1015
1016 let js_id = ast.create_node(NodeType::JavaScript, resolved);
1017 ast.add_edge(node_id, js_id, EdgeType::Child);
1018 }
1019
1020 Ok(())
1021 }
1022
1023 fn build_font_resources(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
1024 let nodes = ast.get_all_nodes();
1025 let mut fonts = Vec::new();
1026
1027 for node in nodes {
1028 if matches!(
1029 node.node_type,
1030 NodeType::Font
1031 | NodeType::Type1Font
1032 | NodeType::TrueTypeFont
1033 | NodeType::Type3Font
1034 | NodeType::CIDFont
1035 ) {
1036 fonts.push(node.id);
1037 }
1038 }
1039
1040 for font_id in fonts {
1041 let dict = match ast.get_node(font_id).and_then(|n| n.as_dict()).cloned() {
1042 Some(d) => d,
1043 None => continue,
1044 };
1045
1046 if let Some(encoding_val) = dict.get("Encoding") {
1047 self.attach_encoding_node(ast, font_id, encoding_val)?;
1048 }
1049
1050 if let Some(to_unicode_val) = dict.get("ToUnicode") {
1051 self.attach_tounicode_node(ast, font_id, to_unicode_val)?;
1052 }
1053
1054 if let Some(cid_info) = dict.get("CIDSystemInfo") {
1055 let cid_id = ast.create_node(NodeType::Metadata, cid_info.clone());
1056 if let Some(node) = ast.get_node_mut(cid_id) {
1057 node.metadata
1058 .set_property("metadata_kind".to_string(), "cid_system_info".to_string());
1059 }
1060 ast.add_edge(font_id, cid_id, EdgeType::Child);
1061 }
1062 }
1063
1064 Ok(())
1065 }
1066
1067 fn attach_encoding_node(
1068 &mut self,
1069 ast: &mut PdfAstGraph,
1070 font_id: NodeId,
1071 value: &PdfValue,
1072 ) -> Result<(), String> {
1073 let resolved = match value {
1074 PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1075 _ => value.clone(),
1076 };
1077
1078 let encoding_id = ast.create_node(NodeType::Encoding, resolved);
1079 if let Some(node) = ast.get_node_mut(encoding_id) {
1080 node.metadata
1081 .set_property("metadata_kind".to_string(), "font_encoding".to_string());
1082 }
1083 ast.add_edge(font_id, encoding_id, EdgeType::Child);
1084 Ok(())
1085 }
1086
1087 fn attach_tounicode_node(
1088 &mut self,
1089 ast: &mut PdfAstGraph,
1090 font_id: NodeId,
1091 value: &PdfValue,
1092 ) -> Result<(), String> {
1093 let resolved = match value {
1094 PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1095 _ => value.clone(),
1096 };
1097
1098 let stream = match resolved {
1099 PdfValue::Stream(stream) => stream,
1100 _ => {
1101 let node_id = ast.create_node(NodeType::ToUnicode, resolved);
1102 ast.add_edge(font_id, node_id, EdgeType::Child);
1103 return Ok(());
1104 }
1105 };
1106
1107 let map = self.object_to_node.clone();
1108 let resolver_map = ObjectNodeMap::from_map(map);
1109 let mut cmap_parser = crate::parser::cmap::CMapParser::new(ast, &resolver_map);
1110 if let Some(node_id) = cmap_parser.parse_tounicode_stream(&stream) {
1111 ast.add_edge(font_id, node_id, EdgeType::Child);
1112 }
1113 Ok(())
1114 }
1115
1116 fn load_object_value(&mut self, obj_id: ObjectId) -> Option<PdfValue> {
1117 let offset = self.xref_table.get(&obj_id).copied()?;
1118 self.reader.seek(SeekFrom::Start(offset)).ok()?;
1119 let mut buffer = Vec::new();
1120 let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
1121 let mut total_read = 0usize;
1122 let mut chunk = vec![0u8; 65536];
1123 while total_read < max_bytes {
1124 let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
1125 let bytes_read = self.reader.read(&mut chunk[..to_read]).ok()?;
1126 if bytes_read == 0 {
1127 break;
1128 }
1129 buffer.extend_from_slice(&chunk[..bytes_read]);
1130 total_read += bytes_read;
1131 if buffer.windows(6).any(|w| w == b"endobj") {
1132 break;
1133 }
1134 }
1135
1136 object_parser::parse_indirect_object(&buffer)
1137 .ok()
1138 .map(|(_, (_, value))| value)
1139 }
1140
1141 fn create_operator_node(
1142 &self,
1143 ast: &mut PdfAstGraph,
1144 operator: &content_stream::ContentOperator,
1145 index: usize,
1146 ) -> NodeId {
1147 use content_stream::ContentOperator;
1148
1149 let value = match operator {
1151 ContentOperator::BeginText => PdfValue::Name(crate::types::PdfName::new("BT")),
1152 ContentOperator::EndText => PdfValue::Name(crate::types::PdfName::new("ET")),
1153 ContentOperator::SetFont(name, size) => {
1154 let mut dict = PdfDictionary::new();
1155 dict.insert("Font", PdfValue::Name(crate::types::PdfName::new(name)));
1156 dict.insert("Size", PdfValue::Real(*size));
1157 PdfValue::Dictionary(dict)
1158 }
1159 ContentOperator::ShowText(text) => {
1160 PdfValue::String(crate::types::PdfString::new_literal(text.clone()))
1161 }
1162 ContentOperator::MoveText(x, y) => {
1163 let mut dict = PdfDictionary::new();
1164 dict.insert("X", PdfValue::Real(*x));
1165 dict.insert("Y", PdfValue::Real(*y));
1166 PdfValue::Dictionary(dict)
1167 }
1168 ContentOperator::PaintXObject(name) => PdfValue::Name(crate::types::PdfName::new(name)),
1169 _ => {
1170 PdfValue::Name(crate::types::PdfName::new(format!("Op_{}", index)))
1172 }
1173 };
1174
1175 let node_id = ast.create_node(NodeType::ContentOperator, value);
1176
1177 if let Some(node) = ast.get_node_mut(node_id) {
1179 node.metadata
1180 .properties
1181 .insert("operator_type".to_string(), format!("{:?}", operator));
1182 node.metadata
1183 .properties
1184 .insert("index".to_string(), index.to_string());
1185 }
1186
1187 node_id
1188 }
1189}
1190
/// Location details for an object extracted from an object stream.
#[derive(Debug, Clone)]
struct CompressedObjectMeta {
    /// Stored as the node's `offset`; set to the containing stream's file
    /// offset when that offset is known.
    file_offset: Option<u64>,
    /// File offset of the containing object stream, when known.
    container_offset: Option<u64>,
    /// Position of the object inside the decoded object stream.
    object_offset: Option<u64>,
    /// Number of decoded bytes attributed to the object.
    object_length: Option<usize>,
}
1198
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// A well-formed `N G obj` header yields the object number and generation.
    #[test]
    fn test_object_header_parsing() {
        let data = b"123 0 obj";
        let result = ReferenceResolver::<Cursor<Vec<u8>>>::parse_object_header(data);
        assert!(result.is_ok());
        let (_, obj_id) = result.unwrap();
        assert_eq!(obj_id.number, 123);
        assert_eq!(obj_id.generation, 0);
    }

    /// The scanner reports the index of the first digit of the object number.
    #[test]
    fn test_find_next_object() {
        let data = b"some text 42 0 obj more text";
        let pos = ReferenceResolver::<Cursor<Vec<u8>>>::find_next_object(data);
        // "42" starts at byte 10.
        assert_eq!(pos, Some(10));
    }

    /// A single reference inside a dictionary is queued exactly once.
    #[test]
    fn test_reference_collection() {
        // The buffer holds no PDF structure, so the constructor ends up with an
        // empty xref table. It is larger than the 1024-byte tail that the
        // constructor inspects.
        let pdf_data = vec![0u8; 2048];
        let mut resolver = ReferenceResolver::new(
            Cursor::new(pdf_data),
            true,
            crate::performance::PerformanceLimits::default(),
        )
        .unwrap();
        let mut ast = PdfAstGraph::new();

        let mut dict = PdfDictionary::new();
        dict.insert("Ref", PdfValue::Reference(PdfReference::new(5, 0)));
        let node_id = ast.create_node(NodeType::Root, PdfValue::Dictionary(dict));

        if let Some(node) = ast.get_node(node_id) {
            resolver.collect_references_from_node(node_id, &node.value);
        }

        assert_eq!(resolver.pending_references.len(), 1);
    }
}