Skip to main content

pdf_ast/parser/
reference_resolver.rs

1use crate::ast::document::XRefEntry;
2use crate::ast::{EdgeType, NodeId, NodeType, PdfAstGraph, PdfDocument};
3use crate::filters::decode_stream_with_limits;
4use crate::parser::{content_operands, content_stream, object_parser};
5use crate::performance::PerformanceLimits;
6use crate::types::{ObjectId, PdfDictionary, PdfReference, PdfValue, StreamData};
7use log::{debug, info, warn};
8use nom::IResult;
9use std::collections::{HashMap, HashSet, VecDeque};
10use std::io::{BufRead, Seek, SeekFrom};
11
12/// Simple mapping from ObjectId to NodeId for use in parsers
13pub struct ObjectNodeMap {
14    object_to_node: HashMap<ObjectId, NodeId>,
15}
16
17impl Default for ObjectNodeMap {
18    fn default() -> Self {
19        Self::new()
20    }
21}
22
23impl ObjectNodeMap {
24    pub fn new() -> Self {
25        ObjectNodeMap {
26            object_to_node: HashMap::new(),
27        }
28    }
29
30    pub fn insert(&mut self, obj_id: ObjectId, node_id: NodeId) {
31        self.object_to_node.insert(obj_id, node_id);
32    }
33
34    pub fn get_node_id(&self, obj_id: &ObjectId) -> Option<NodeId> {
35        self.object_to_node.get(obj_id).copied()
36    }
37
38    pub fn get_object_node_map(&self) -> ObjectNodeMap {
39        ObjectNodeMap::from_map(self.object_to_node.clone())
40    }
41
42    pub fn from_map(map: HashMap<ObjectId, NodeId>) -> Self {
43        ObjectNodeMap {
44            object_to_node: map,
45        }
46    }
47}
48
49/// Resolves PDF references and builds complete object graph with proper edges
50pub struct ReferenceResolver<R: BufRead + Seek> {
51    reader: R,
52    xref_table: HashMap<ObjectId, u64>,
53    compressed_objects: HashMap<ObjectId, (u32, u32)>,
54    object_to_node: HashMap<ObjectId, NodeId>, // Maps ObjectId to NodeId
55    resolved_objects: HashSet<ObjectId>,
56    pending_references: VecDeque<(NodeId, PdfReference)>, // (source_node, reference)
57    tolerant: bool,
58    limits: PerformanceLimits,
59}
60
61impl<R: BufRead + Seek> ReferenceResolver<R> {
62    pub fn new(mut reader: R, tolerant: bool, limits: PerformanceLimits) -> Result<Self, String> {
63        let xref_table = Self::build_xref_table(&mut reader)?;
64
65        Ok(Self {
66            reader,
67            xref_table,
68            compressed_objects: HashMap::new(),
69            object_to_node: HashMap::new(),
70            resolved_objects: HashSet::new(),
71            pending_references: VecDeque::new(),
72            tolerant,
73            limits,
74        })
75    }
76
77    /// Create resolver using existing document xref information
78    pub fn from_document(
79        reader: R,
80        document: &PdfDocument,
81        tolerant: bool,
82        limits: PerformanceLimits,
83    ) -> Self {
84        let mut xref_table = HashMap::new();
85        let mut compressed_objects = HashMap::new();
86
87        // Convert document xref entries to our format
88        for (obj_id, entry) in &document.xref.entries {
89            match entry {
90                XRefEntry::InUse { offset, .. } => {
91                    xref_table.insert(*obj_id, *offset);
92                }
93                XRefEntry::Compressed {
94                    stream_object,
95                    index,
96                } => {
97                    compressed_objects.insert(*obj_id, (*stream_object, *index));
98                    // Track compressed object references
99                    debug!(
100                        "Object {:?} is compressed in stream {:?} at index {}",
101                        obj_id, stream_object, index
102                    );
103                }
104                _ => {}
105            }
106        }
107
108        info!("Converted {} xref entries from document", xref_table.len());
109
110        Self {
111            reader,
112            xref_table,
113            compressed_objects,
114            object_to_node: HashMap::new(),
115            resolved_objects: HashSet::new(),
116            pending_references: VecDeque::new(),
117            tolerant,
118            limits,
119        }
120    }
121
122    /// Build cross-reference table by scanning the PDF
123    fn build_xref_table(reader: &mut R) -> Result<HashMap<ObjectId, u64>, String> {
124        // Find startxref offset
125        reader
126            .seek(SeekFrom::End(-1024))
127            .map_err(|e| format!("Seek error: {}", e))?;
128
129        let mut buffer = Vec::new();
130        reader
131            .read_to_end(&mut buffer)
132            .map_err(|e| format!("Read error: {}", e))?;
133
134        let content = String::from_utf8_lossy(&buffer);
135
136        if let Some(startxref_pos) = content.rfind("startxref") {
137            let xref_section = &content[startxref_pos..];
138            if let Some(offset_str) = xref_section.lines().nth(1) {
139                if let Ok(xref_offset) = offset_str.trim().parse::<u64>() {
140                    return Self::parse_xref_table(reader, xref_offset);
141                }
142            }
143        }
144
145        // Fallback: scan entire file
146        Self::scan_for_objects(reader)
147    }
148
149    /// Parse xref table at given offset
150    fn parse_xref_table(reader: &mut R, offset: u64) -> Result<HashMap<ObjectId, u64>, String> {
151        reader
152            .seek(SeekFrom::Start(offset))
153            .map_err(|e| format!("Seek error: {}", e))?;
154
155        let mut buffer = Vec::new();
156        reader
157            .read_to_end(&mut buffer)
158            .map_err(|e| format!("Read error: {}", e))?;
159
160        // Try to parse as xref stream first (PDF 1.5+)
161        if buffer.starts_with(b"<<") || buffer.iter().take(20).any(|&b| b.is_ascii_digit()) {
162            // Might be xref stream object
163            if let Ok((_, (_obj_id, PdfValue::Stream(stream)))) =
164                object_parser::parse_indirect_object(&buffer)
165            {
166                return crate::parser::xref::parse_xref_stream(&stream).map(|entries| {
167                    entries
168                        .into_iter()
169                        .filter_map(|(id, entry)| {
170                            if let XRefEntry::InUse { offset, .. } = entry {
171                                Some((id, offset))
172                            } else {
173                                None
174                            }
175                        })
176                        .collect()
177                });
178            }
179        }
180
181        // Parse traditional xref table
182        let mut xref_table = HashMap::new();
183        let content = String::from_utf8_lossy(&buffer);
184
185        if content.starts_with("xref") {
186            let mut lines = content.lines().skip(1); // Skip "xref"
187
188            while let Some(line) = lines.next() {
189                let line = line.trim();
190                if line.is_empty() || line.starts_with("trailer") {
191                    break;
192                }
193
194                // Parse subsection header
195                let parts: Vec<&str> = line.split_whitespace().collect();
196                if parts.len() == 2 {
197                    if let (Ok(start), Ok(count)) =
198                        (parts[0].parse::<u32>(), parts[1].parse::<u32>())
199                    {
200                        for i in 0..count {
201                            if let Some(entry_line) = lines.next() {
202                                let entry_parts: Vec<&str> =
203                                    entry_line.split_whitespace().collect();
204                                if entry_parts.len() >= 3 && entry_parts[2] == "n" {
205                                    if let (Ok(offset), Ok(gen)) = (
206                                        entry_parts[0].parse::<u64>(),
207                                        entry_parts[1].parse::<u16>(),
208                                    ) {
209                                        let obj_id = ObjectId::new(start + i, gen);
210                                        xref_table.insert(obj_id, offset);
211                                    }
212                                }
213                            }
214                        }
215                    }
216                }
217            }
218        }
219
220        Ok(xref_table)
221    }
222
223    /// Scan entire file for object definitions
224    fn scan_for_objects(reader: &mut R) -> Result<HashMap<ObjectId, u64>, String> {
225        reader
226            .seek(SeekFrom::Start(0))
227            .map_err(|e| format!("Seek error: {}", e))?;
228
229        let mut content = Vec::new();
230        reader
231            .read_to_end(&mut content)
232            .map_err(|e| format!("Read error: {}", e))?;
233
234        let mut xref_table = HashMap::new();
235        let mut pos = 0;
236
237        // Find all "n m obj" patterns
238        while pos < content.len() {
239            if let Some(obj_pos) = Self::find_next_object(&content[pos..]) {
240                let absolute_pos = pos + obj_pos;
241
242                // Parse object header
243                if let Ok((_, obj_id)) = Self::parse_object_header(&content[absolute_pos..]) {
244                    xref_table.insert(obj_id, absolute_pos as u64);
245                }
246
247                pos = absolute_pos + 1;
248            } else {
249                break;
250            }
251        }
252
253        info!("Found {} objects by scanning", xref_table.len());
254        Ok(xref_table)
255    }
256
257    fn find_next_object(data: &[u8]) -> Option<usize> {
258        for i in 0..data.len().saturating_sub(10) {
259            // Look for pattern: digit(s) space digit(s) space "obj"
260            if data[i].is_ascii_digit() {
261                let mut j = i;
262                while j < data.len() && data[j].is_ascii_digit() {
263                    j += 1;
264                }
265                if j < data.len() && data[j].is_ascii_whitespace() {
266                    j += 1;
267                    while j < data.len() && data[j].is_ascii_whitespace() {
268                        j += 1;
269                    }
270                    if j < data.len() && data[j].is_ascii_digit() {
271                        let _k = j;
272                        while j < data.len() && data[j].is_ascii_digit() {
273                            j += 1;
274                        }
275                        if j + 4 <= data.len() && &data[j..j + 4] == b" obj" {
276                            return Some(i);
277                        }
278                    }
279                }
280            }
281        }
282        None
283    }
284
285    fn parse_object_header(data: &[u8]) -> IResult<&[u8], ObjectId> {
286        use nom::{
287            bytes::complete::tag,
288            character::complete::{digit1, space1},
289            combinator::map,
290            sequence::tuple,
291        };
292
293        map(
294            tuple((digit1, space1, digit1, space1, tag(b"obj"))),
295            |(num, _, gen, _, _)| {
296                let num = std::str::from_utf8(num).unwrap_or("0").parse().unwrap_or(0);
297                let gen = std::str::from_utf8(gen).unwrap_or("0").parse().unwrap_or(0);
298                ObjectId::new(num, gen)
299            },
300        )(data)
301    }
302
303    /// Resolve all references in the AST with proper edge creation
304    pub fn resolve_references(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
305        // First pass: collect all references from existing nodes
306        let nodes = ast.get_all_nodes();
307        for node in &nodes {
308            self.collect_references_from_node(node.id, &node.value);
309        }
310
311        // Second pass: resolve references and create edges
312        while let Some((source_node, pdf_ref)) = self.pending_references.pop_front() {
313            let obj_id = pdf_ref.id();
314
315            // Check if we already have this object as a node
316            let target_node = if let Some(&existing_node) = self.object_to_node.get(&obj_id) {
317                existing_node
318            } else if !self.resolved_objects.contains(&obj_id) {
319                // Resolve the object
320                match self.resolve_object(obj_id, ast) {
321                    Ok(node_id) => {
322                        self.resolved_objects.insert(obj_id);
323                        self.object_to_node.insert(obj_id, node_id);
324                        node_id
325                    }
326                    Err(e) => {
327                        warn!("Failed to resolve reference {}: {}", obj_id, e);
328                        continue;
329                    }
330                }
331            } else {
332                continue; // Already resolved but not found in map
333            };
334
335            // Create reference edge from source to target
336            ast.add_edge(source_node, target_node, EdgeType::Reference);
337            debug!(
338                "Created reference edge from {:?} to {:?} for object {}",
339                source_node, target_node, obj_id
340            );
341        }
342
343        // Third pass: resolve indirect Length references in streams
344        self.resolve_stream_lengths(ast)?;
345
346        // Fourth pass: build page resource nodes (colorspaces, ICC profiles)
347        self.build_page_resources(ast)?;
348
349        // Fifth pass: build font-related AST nodes
350        self.build_font_resources(ast)?;
351
352        // Sixth pass: build AST from content streams
353        self.build_content_stream_ast(ast)?;
354
355        // Seventh pass: attach JavaScript nodes from action dictionaries
356        self.build_javascript_nodes(ast)?;
357
358        Ok(())
359    }
360
361    fn build_page_resources(&self, ast: &mut PdfAstGraph) -> Result<(), String> {
362        use crate::parser::colorspace::ColorSpaceParser;
363
364        let resolver_map = ObjectNodeMap::from_map(self.object_to_node.clone());
365        let node_ids: Vec<NodeId> = ast.get_all_nodes().iter().map(|n| n.id).collect();
366        for node_id in node_ids {
367            let node = match ast.get_node(node_id) {
368                Some(node) => node,
369                None => continue,
370            };
371            if node.node_type != NodeType::Page {
372                continue;
373            }
374
375            let page_dict = match node.as_dict() {
376                Some(dict) => dict.clone(),
377                None => continue,
378            };
379
380            let resources = match page_dict.get("Resources") {
381                Some(PdfValue::Dictionary(dict)) => Some(dict.clone()),
382                Some(PdfValue::Reference(res_ref)) => self
383                    .object_to_node
384                    .get(&res_ref.id())
385                    .and_then(|res_id| ast.get_node(*res_id))
386                    .and_then(|res_node| res_node.as_dict().cloned()),
387                _ => None,
388            };
389
390            let resources = match resources {
391                Some(res) => res,
392                None => continue,
393            };
394
395            if let Some(PdfValue::Dictionary(colorspaces)) = resources.get("ColorSpace") {
396                for (cs_name, cs_value) in colorspaces.iter() {
397                    let mut parser = ColorSpaceParser::new(ast, &resolver_map);
398                    if let Some(cs_id) = parser.parse_colorspace(cs_value) {
399                        ast.add_edge(node_id, cs_id, EdgeType::Resource);
400                        if let Some(cs_node) = ast.get_node_mut(cs_id) {
401                            cs_node
402                                .metadata
403                                .set_property("resource_name".to_string(), cs_name.to_string());
404                        }
405                    }
406                }
407            }
408        }
409
410        Ok(())
411    }
412
413    fn collect_references_from_node(&mut self, node_id: NodeId, value: &PdfValue) {
414        let mut stack = vec![value];
415        while let Some(current) = stack.pop() {
416            match current {
417                PdfValue::Reference(pdf_ref) => {
418                    self.pending_references.push_back((node_id, *pdf_ref));
419                }
420                PdfValue::Array(array) => {
421                    for item in array.iter() {
422                        stack.push(item);
423                    }
424                }
425                PdfValue::Dictionary(dict) => {
426                    for (_, val) in dict.iter() {
427                        stack.push(val);
428                    }
429                }
430                PdfValue::Stream(stream) => {
431                    for (_, val) in stream.dict.iter() {
432                        stack.push(val);
433                    }
434                }
435                _ => {}
436            }
437        }
438    }
439
440    /// Resolve a specific object and create its node
441    fn resolve_object(
442        &mut self,
443        obj_id: ObjectId,
444        ast: &mut PdfAstGraph,
445    ) -> Result<NodeId, String> {
446        if let Some(&offset) = self.xref_table.get(&obj_id) {
447            // Read and parse the object
448            self.reader
449                .seek(SeekFrom::Start(offset))
450                .map_err(|e| format!("Seek error: {}", e))?;
451
452            let mut buffer = Vec::new();
453            let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
454            let mut total_read = 0usize;
455            let mut chunk = vec![0u8; 65536];
456            let mut found_endobj = false;
457
458            while total_read < max_bytes {
459                let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
460                let bytes_read = self
461                    .reader
462                    .read(&mut chunk[..to_read])
463                    .map_err(|e| format!("Read error: {}", e))?;
464                if bytes_read == 0 {
465                    break;
466                }
467                buffer.extend_from_slice(&chunk[..bytes_read]);
468                total_read += bytes_read;
469
470                if buffer.windows(6).any(|w| w == b"endobj") {
471                    found_endobj = true;
472                    break;
473                }
474            }
475
476            if !found_endobj && total_read >= max_bytes && !self.tolerant {
477                return Err(format!(
478                    "Object {} exceeds max size {}MB",
479                    obj_id.number, self.limits.max_object_size_mb
480                ));
481            }
482
483            // Try to parse the object
484            match object_parser::parse_indirect_object(&buffer) {
485                Ok((rest, (parsed_obj_id, value))) => {
486                    if parsed_obj_id != obj_id {
487                        warn!(
488                            "Object ID mismatch: expected {:?}, got {:?}",
489                            obj_id, parsed_obj_id
490                        );
491                    }
492
493                    // Create node with proper type
494                    let node_type = self.determine_node_type(&value, obj_id);
495                    let node_id = ast.create_node(node_type, value);
496
497                    // Add metadata
498                    if let Some(node) = ast.get_node_mut(node_id) {
499                        node.metadata.offset = Some(offset);
500                        node.metadata.size = Some(buffer.len() - rest.len());
501                        node.metadata.properties.insert(
502                            "object_id".to_string(),
503                            format!("{} {} R", obj_id.number, obj_id.generation),
504                        );
505                        if let PdfValue::Stream(stream) = &node.value {
506                            node.metadata
507                                .properties
508                                .insert("stream_length".to_string(), stream.data.len().to_string());
509                            node.metadata.properties.insert(
510                                "stream_filters".to_string(),
511                                stream
512                                    .get_filters()
513                                    .iter()
514                                    .map(|f| f.name())
515                                    .collect::<Vec<_>>()
516                                    .join(","),
517                            );
518                        }
519                    }
520
521                    Ok(node_id)
522                }
523                Err(e) => {
524                    if self.tolerant {
525                        if let Some(recovered) = self.parse_object_value_fallback(&buffer) {
526                            let node_type = self.determine_node_type(&recovered, obj_id);
527                            let node_id = ast.create_node(node_type, recovered);
528                            if let Some(node) = ast.get_node_mut(node_id) {
529                                node.metadata.offset = Some(offset);
530                                node.metadata.size = Some(buffer.len());
531                                node.metadata.warnings.push(
532                                    "Recovered object by parsing value after obj keyword"
533                                        .to_string(),
534                                );
535                                node.metadata.properties.insert(
536                                    "recovery".to_string(),
537                                    "parse_value_after_obj".to_string(),
538                                );
539                            }
540                            return Ok(node_id);
541                        }
542
543                        let node_id = ast.create_node(NodeType::Object(obj_id), PdfValue::Null);
544                        if let Some(node) = ast.get_node_mut(node_id) {
545                            node.metadata.offset = Some(offset);
546                            node.metadata.size = Some(buffer.len());
547                            node.metadata.errors.push(crate::ast::node::ParseError {
548                                code: crate::ast::node::ErrorCode::InvalidSyntax,
549                                message: format!("Failed to parse object: {:?}", e),
550                                offset: Some(offset),
551                                recoverable: true,
552                            });
553                            node.metadata
554                                .warnings
555                                .push("Recovered from parse error".to_string());
556                            node.metadata.properties.insert(
557                                "recovery".to_string(),
558                                "parse_indirect_object_failed".to_string(),
559                            );
560                        }
561                        Ok(node_id)
562                    } else {
563                        Err(format!(
564                            "Failed to parse object at offset {}: {:?}",
565                            offset, e
566                        ))
567                    }
568                }
569            }
570        } else if let Some(&(stream_object, index)) = self.compressed_objects.get(&obj_id) {
571            let (value, meta) = self
572                .resolve_compressed_object(stream_object, index)
573                .map_err(|e| format!("Compressed object {} error: {}", obj_id.number, e))?;
574            let node_type = self.determine_node_type(&value, obj_id);
575            let node_id = ast.create_node(node_type, value);
576
577            if let Some(node) = ast.get_node_mut(node_id) {
578                node.metadata.offset = meta.file_offset;
579                node.metadata.size = meta.object_length;
580                node.metadata.properties.insert(
581                    "object_id".to_string(),
582                    format!("{} {} R", obj_id.number, obj_id.generation),
583                );
584                node.metadata.properties.insert(
585                    "container_stream".to_string(),
586                    format!("{} 0 R", stream_object),
587                );
588                if let Some(offset) = meta.container_offset {
589                    node.metadata
590                        .properties
591                        .insert("container_stream_offset".to_string(), offset.to_string());
592                }
593                if let Some(stream_offset) = meta.object_offset {
594                    node.metadata.properties.insert(
595                        "object_stream_offset".to_string(),
596                        stream_offset.to_string(),
597                    );
598                }
599                if let Some(stream_length) = meta.object_length {
600                    node.metadata.properties.insert(
601                        "object_stream_length".to_string(),
602                        stream_length.to_string(),
603                    );
604                }
605                node.metadata
606                    .properties
607                    .insert("object_stream_index".to_string(), index.to_string());
608            }
609
610            Ok(node_id)
611        } else if self.tolerant {
612            let node_id = ast.create_node(NodeType::Object(obj_id), PdfValue::Null);
613            if let Some(node) = ast.get_node_mut(node_id) {
614                node.metadata.errors.push(crate::ast::node::ParseError {
615                    code: crate::ast::node::ErrorCode::MissingObject,
616                    message: "Object not found in xref table".to_string(),
617                    offset: None,
618                    recoverable: true,
619                });
620                node.metadata
621                    .warnings
622                    .push("Recovered missing object reference".to_string());
623                node.metadata
624                    .properties
625                    .insert("recovery".to_string(), "xref_missing_object".to_string());
626            }
627            Ok(node_id)
628        } else {
629            Err(format!("Object {} not found in xref table", obj_id))
630        }
631    }
632
633    fn parse_object_value_fallback(&self, buffer: &[u8]) -> Option<PdfValue> {
634        let obj_pos = buffer.windows(3).position(|w| w == b"obj")?;
635        let mut pos = obj_pos + 3;
636        while pos < buffer.len() && buffer[pos].is_ascii_whitespace() {
637            pos += 1;
638        }
639        object_parser::parse_value(&buffer[pos..])
640            .ok()
641            .map(|(_, value)| value)
642    }
643
644    fn resolve_compressed_object(
645        &mut self,
646        stream_object: u32,
647        index: u32,
648    ) -> Result<(PdfValue, CompressedObjectMeta), String> {
649        let stream_id = ObjectId::new(stream_object, 0);
650        let stream_offset = self.xref_table.get(&stream_id).copied();
651        let (stream, dict) = self.load_object_stream(stream_object)?;
652        let (value, object_offset, object_length) =
653            self.parse_object_stream_entry(&stream, &dict, index)?;
654
655        Ok((
656            value,
657            CompressedObjectMeta {
658                file_offset: stream_offset,
659                container_offset: stream_offset,
660                object_offset: Some(object_offset as u64),
661                object_length: Some(object_length),
662            },
663        ))
664    }
665
666    fn load_object_stream(
667        &mut self,
668        stream_object: u32,
669    ) -> Result<(Vec<u8>, PdfDictionary), String> {
670        let stream_id = ObjectId::new(stream_object, 0);
671        let offset = self
672            .xref_table
673            .get(&stream_id)
674            .copied()
675            .ok_or_else(|| format!("Object stream {} offset missing", stream_object))?;
676
677        self.reader
678            .seek(SeekFrom::Start(offset))
679            .map_err(|e| format!("Seek error: {}", e))?;
680
681        let mut buffer = Vec::new();
682        let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
683        let mut total_read = 0usize;
684        let mut chunk = vec![0u8; 65536];
685        let mut found_endobj = false;
686
687        while total_read < max_bytes {
688            let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
689            let bytes_read = self
690                .reader
691                .read(&mut chunk[..to_read])
692                .map_err(|e| format!("Read error: {}", e))?;
693            if bytes_read == 0 {
694                break;
695            }
696            buffer.extend_from_slice(&chunk[..bytes_read]);
697            total_read += bytes_read;
698
699            if buffer.windows(6).any(|w| w == b"endobj") {
700                found_endobj = true;
701                break;
702            }
703        }
704
705        if !found_endobj && !self.tolerant {
706            return Err("Object stream missing endobj".to_string());
707        }
708
709        let (_, (_obj_id, value)) = object_parser::parse_indirect_object(&buffer)
710            .map_err(|e| format!("Failed to parse object stream: {:?}", e))?;
711        let stream = match value {
712            PdfValue::Stream(stream) => stream,
713            _ => return Err("Object stream is not a stream".to_string()),
714        };
715
716        let filters = stream.get_filters();
717        let raw = stream
718            .raw_data()
719            .ok_or_else(|| "Object stream has no data".to_string())?;
720
721        let decoded = decode_stream_with_limits(
722            raw,
723            &filters,
724            self.limits.max_object_size_mb * 1024 * 1024,
725            self.limits.max_stream_decode_ratio,
726        )
727        .map_err(|e| format!("Failed to decode object stream: {}", e))?;
728
729        Ok((decoded, stream.dict))
730    }
731
732    fn parse_object_stream_entry(
733        &self,
734        data: &[u8],
735        dict: &PdfDictionary,
736        index: u32,
737    ) -> Result<(PdfValue, usize, usize), String> {
738        let n = dict.get("N").and_then(|v| v.as_integer()).unwrap_or(0) as usize;
739        let first = dict.get("First").and_then(|v| v.as_integer()).unwrap_or(0) as usize;
740
741        if n == 0 || first == 0 || first > data.len() {
742            return Err("Invalid object stream header".to_string());
743        }
744        if index as usize >= n {
745            return Err("Object stream index out of range".to_string());
746        }
747
748        let header = &data[..first];
749        let mut pos = 0usize;
750        let mut offsets = Vec::with_capacity(n);
751
752        for _ in 0..n {
753            while pos < header.len() && header[pos].is_ascii_whitespace() {
754                pos += 1;
755            }
756            let num_start = pos;
757            while pos < header.len() && header[pos].is_ascii_digit() {
758                pos += 1;
759            }
760            let _obj_num = std::str::from_utf8(&header[num_start..pos])
761                .unwrap_or("0")
762                .parse::<u32>()
763                .unwrap_or(0);
764            while pos < header.len() && header[pos].is_ascii_whitespace() {
765                pos += 1;
766            }
767            let off_start = pos;
768            while pos < header.len() && header[pos].is_ascii_digit() {
769                pos += 1;
770            }
771            let obj_offset = std::str::from_utf8(&header[off_start..pos])
772                .unwrap_or("0")
773                .parse::<usize>()
774                .unwrap_or(0);
775            offsets.push(obj_offset);
776        }
777
778        if offsets.len() <= index as usize {
779            return Err("Object stream header incomplete".to_string());
780        }
781
782        let start = first + offsets[index as usize];
783        let mut next_offset = data.len();
784        for off in offsets.iter().skip(index as usize + 1) {
785            let candidate = first + *off;
786            if candidate > start && candidate < next_offset {
787                next_offset = candidate;
788            }
789        }
790
791        if start >= data.len() || start >= next_offset {
792            return Err("Invalid object stream offsets".to_string());
793        }
794
795        let slice = &data[start..next_offset];
796        let (_, value) =
797            object_parser::parse_value(slice).map_err(|e| format!("Parse value error: {:?}", e))?;
798        Ok((value, start, next_offset - start))
799    }
800
801    fn determine_node_type(&self, value: &PdfValue, obj_id: ObjectId) -> NodeType {
802        if let PdfValue::Dictionary(dict) = value {
803            if let Some(PdfValue::Name(type_name)) = dict.get("Type") {
804                match type_name.as_str() {
805                    "/Catalog" => return NodeType::Catalog,
806                    "/Pages" => return NodeType::Pages,
807                    "/Page" => return NodeType::Page,
808                    "/Font" => return NodeType::Font,
809                    "/XObject" => {
810                        if let Some(PdfValue::Name(subtype)) = dict.get("Subtype") {
811                            if subtype.as_str() == "/Image" {
812                                return NodeType::Image;
813                            }
814                        }
815                        return NodeType::XObject;
816                    }
817                    "/Annot" => return NodeType::Annotation,
818                    "/Metadata" => return NodeType::Metadata,
819                    _ => {}
820                }
821            }
822        }
823
824        if let PdfValue::Stream(_) = value {
825            return NodeType::ContentStream;
826        }
827
828        NodeType::Object(obj_id)
829    }
830
831    /// Resolve indirect Length references in streams
832    fn resolve_stream_lengths(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
833        let nodes = ast.get_all_nodes();
834        let mut updates = Vec::new();
835
836        for node in nodes {
837            if let PdfValue::Stream(stream) = &node.value {
838                if let Some(PdfValue::Reference(length_ref)) = stream.dict.get("Length") {
839                    // Resolve the length reference
840                    let length_obj_id = length_ref.id();
841
842                    if let Some(&offset) = self.xref_table.get(&length_obj_id) {
843                        self.reader
844                            .seek(SeekFrom::Start(offset))
845                            .map_err(|e| format!("Seek error: {}", e))?;
846
847                        let mut buffer = vec![0u8; 1024];
848                        let bytes_read = self
849                            .reader
850                            .read(&mut buffer)
851                            .map_err(|e| format!("Read error: {}", e))?;
852
853                        if let Ok((_, (_, PdfValue::Integer(length)))) =
854                            object_parser::parse_indirect_object(&buffer[..bytes_read])
855                        {
856                            updates.push((node.id, length as usize));
857                            info!(
858                                "Resolved indirect Length {} for stream in node {:?}",
859                                length, node.id
860                            );
861                        }
862                    }
863                }
864            }
865        }
866
867        // Apply the resolved lengths
868        for (node_id, length) in updates {
869            if let Some(node) = ast.get_node_mut(node_id) {
870                if let PdfValue::Stream(ref mut stream) = node.value {
871                    // Update the Length entry
872                    stream
873                        .dict
874                        .insert("Length", PdfValue::Integer(length as i64));
875
876                    // If we have raw data, validate/truncate to correct length
877                    if let StreamData::Raw(ref mut data) = stream.data {
878                        if data.len() > length {
879                            data.truncate(length);
880                            debug!("Truncated stream data to resolved length {}", length);
881                        }
882                    }
883                }
884            }
885        }
886
887        Ok(())
888    }
889
890    /// Build AST nodes from content streams
891    fn build_content_stream_ast(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
892        let nodes = ast.get_all_nodes();
893        let mut content_streams = Vec::new();
894
895        // Find all content streams
896        for node in nodes {
897            if matches!(node.node_type, NodeType::ContentStream)
898                || (matches!(node.node_type, NodeType::Page) && node.as_dict().is_some())
899            {
900                content_streams.push(node.id);
901            }
902        }
903
904        // Process each content stream
905        for stream_node_id in content_streams {
906            if let Some(node) = ast.get_node(stream_node_id) {
907                let stream_data = if let PdfValue::Stream(stream) = &node.value {
908                    // Decode the stream if needed
909                    // Get stream data and filters
910                    let data = match &stream.data {
911                        crate::types::stream::StreamData::Raw(data) => data,
912                        crate::types::stream::StreamData::Decoded(data) => data,
913                        _ => continue, // Skip lazy streams for now
914                    };
915                    let filters = stream.get_filters();
916                    match decode_stream_with_limits(
917                        data,
918                        &filters,
919                        self.limits.max_object_size_mb * 1024 * 1024,
920                        self.limits.max_stream_decode_ratio,
921                    ) {
922                        Ok(decoded) => decoded,
923                        Err(e) => {
924                            warn!("Failed to decode stream: {}", e);
925                            continue;
926                        }
927                    }
928                } else if let PdfValue::Dictionary(dict) = &node.value {
929                    // Page dictionary - look for Contents
930                    if let Some(PdfValue::Reference(_)) = dict.get("Contents") {
931                        continue; // Will be resolved separately
932                    }
933                    continue;
934                } else {
935                    continue;
936                };
937
938                // Parse content stream operators
939                let mut parser = content_stream::ContentStreamParser::new();
940                match parser.parse(&stream_data) {
941                    Ok(operators) => {
942                        let indexed =
943                            content_operands::parse_content_stream_with_offsets(&stream_data);
944                        if indexed.is_empty() {
945                            // fallback to operator list only
946                            for (i, op) in operators.iter().enumerate() {
947                                let op_node_id = self.create_operator_node(ast, op, i);
948                                ast.add_edge(stream_node_id, op_node_id, EdgeType::Child);
949                            }
950                            info!(
951                                "Created {} operator nodes for stream {:?}",
952                                operators.len(),
953                                stream_node_id
954                            );
955                        } else {
956                            for (i, item) in indexed.iter().enumerate() {
957                                let op_node_id = self.create_operator_node(ast, &item.operator, i);
958                                if let Some(node) = ast.get_node_mut(op_node_id) {
959                                    node.metadata.offset = Some(item.offset as u64);
960                                    node.metadata.properties.insert(
961                                        "stream_local_offset".to_string(),
962                                        item.offset.to_string(),
963                                    );
964                                    node.metadata.properties.insert(
965                                        "content_operator_index".to_string(),
966                                        i.to_string(),
967                                    );
968                                }
969                                ast.add_edge(stream_node_id, op_node_id, EdgeType::Child);
970                            }
971                            info!(
972                                "Created {} operator nodes with offsets for stream {:?}",
973                                indexed.len(),
974                                stream_node_id
975                            );
976                        }
977                    }
978                    Err(e) => {
979                        warn!("Failed to parse content stream: {:?}", e);
980                    }
981                }
982            }
983        }
984
985        Ok(())
986    }
987
988    fn build_javascript_nodes(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
989        let node_ids: Vec<NodeId> = ast.get_all_nodes().iter().map(|n| n.id).collect();
990
991        for node_id in node_ids {
992            let dict = match ast.get_node(node_id).and_then(|node| node.as_dict()) {
993                Some(d) => d.clone(),
994                None => continue,
995            };
996
997            let js_value = dict.get("JS").or_else(|| dict.get("JavaScript"));
998            if js_value.is_none() {
999                continue;
1000            }
1001
1002            let existing_js = ast.get_children(node_id).into_iter().any(|child| {
1003                ast.get_node(child)
1004                    .map(|n| n.node_type == NodeType::JavaScript)
1005                    .unwrap_or(false)
1006            });
1007            if existing_js {
1008                continue;
1009            }
1010
1011            let resolved = match js_value.unwrap() {
1012                PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1013                value => value.clone(),
1014            };
1015
1016            let js_id = ast.create_node(NodeType::JavaScript, resolved);
1017            ast.add_edge(node_id, js_id, EdgeType::Child);
1018        }
1019
1020        Ok(())
1021    }
1022
1023    fn build_font_resources(&mut self, ast: &mut PdfAstGraph) -> Result<(), String> {
1024        let nodes = ast.get_all_nodes();
1025        let mut fonts = Vec::new();
1026
1027        for node in nodes {
1028            if matches!(
1029                node.node_type,
1030                NodeType::Font
1031                    | NodeType::Type1Font
1032                    | NodeType::TrueTypeFont
1033                    | NodeType::Type3Font
1034                    | NodeType::CIDFont
1035            ) {
1036                fonts.push(node.id);
1037            }
1038        }
1039
1040        for font_id in fonts {
1041            let dict = match ast.get_node(font_id).and_then(|n| n.as_dict()).cloned() {
1042                Some(d) => d,
1043                None => continue,
1044            };
1045
1046            if let Some(encoding_val) = dict.get("Encoding") {
1047                self.attach_encoding_node(ast, font_id, encoding_val)?;
1048            }
1049
1050            if let Some(to_unicode_val) = dict.get("ToUnicode") {
1051                self.attach_tounicode_node(ast, font_id, to_unicode_val)?;
1052            }
1053
1054            if let Some(cid_info) = dict.get("CIDSystemInfo") {
1055                let cid_id = ast.create_node(NodeType::Metadata, cid_info.clone());
1056                if let Some(node) = ast.get_node_mut(cid_id) {
1057                    node.metadata
1058                        .set_property("metadata_kind".to_string(), "cid_system_info".to_string());
1059                }
1060                ast.add_edge(font_id, cid_id, EdgeType::Child);
1061            }
1062        }
1063
1064        Ok(())
1065    }
1066
1067    fn attach_encoding_node(
1068        &mut self,
1069        ast: &mut PdfAstGraph,
1070        font_id: NodeId,
1071        value: &PdfValue,
1072    ) -> Result<(), String> {
1073        let resolved = match value {
1074            PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1075            _ => value.clone(),
1076        };
1077
1078        let encoding_id = ast.create_node(NodeType::Encoding, resolved);
1079        if let Some(node) = ast.get_node_mut(encoding_id) {
1080            node.metadata
1081                .set_property("metadata_kind".to_string(), "font_encoding".to_string());
1082        }
1083        ast.add_edge(font_id, encoding_id, EdgeType::Child);
1084        Ok(())
1085    }
1086
1087    fn attach_tounicode_node(
1088        &mut self,
1089        ast: &mut PdfAstGraph,
1090        font_id: NodeId,
1091        value: &PdfValue,
1092    ) -> Result<(), String> {
1093        let resolved = match value {
1094            PdfValue::Reference(r) => self.load_object_value(r.id()).unwrap_or(PdfValue::Null),
1095            _ => value.clone(),
1096        };
1097
1098        let stream = match resolved {
1099            PdfValue::Stream(stream) => stream,
1100            _ => {
1101                let node_id = ast.create_node(NodeType::ToUnicode, resolved);
1102                ast.add_edge(font_id, node_id, EdgeType::Child);
1103                return Ok(());
1104            }
1105        };
1106
1107        let map = self.object_to_node.clone();
1108        let resolver_map = ObjectNodeMap::from_map(map);
1109        let mut cmap_parser = crate::parser::cmap::CMapParser::new(ast, &resolver_map);
1110        if let Some(node_id) = cmap_parser.parse_tounicode_stream(&stream) {
1111            ast.add_edge(font_id, node_id, EdgeType::Child);
1112        }
1113        Ok(())
1114    }
1115
1116    fn load_object_value(&mut self, obj_id: ObjectId) -> Option<PdfValue> {
1117        let offset = self.xref_table.get(&obj_id).copied()?;
1118        self.reader.seek(SeekFrom::Start(offset)).ok()?;
1119        let mut buffer = Vec::new();
1120        let max_bytes = self.limits.max_object_size_mb * 1024 * 1024;
1121        let mut total_read = 0usize;
1122        let mut chunk = vec![0u8; 65536];
1123        while total_read < max_bytes {
1124            let to_read = std::cmp::min(chunk.len(), max_bytes - total_read);
1125            let bytes_read = self.reader.read(&mut chunk[..to_read]).ok()?;
1126            if bytes_read == 0 {
1127                break;
1128            }
1129            buffer.extend_from_slice(&chunk[..bytes_read]);
1130            total_read += bytes_read;
1131            if buffer.windows(6).any(|w| w == b"endobj") {
1132                break;
1133            }
1134        }
1135
1136        object_parser::parse_indirect_object(&buffer)
1137            .ok()
1138            .map(|(_, (_, value))| value)
1139    }
1140
1141    fn create_operator_node(
1142        &self,
1143        ast: &mut PdfAstGraph,
1144        operator: &content_stream::ContentOperator,
1145        index: usize,
1146    ) -> NodeId {
1147        use content_stream::ContentOperator;
1148
1149        // Create appropriate value for the operator
1150        let value = match operator {
1151            ContentOperator::BeginText => PdfValue::Name(crate::types::PdfName::new("BT")),
1152            ContentOperator::EndText => PdfValue::Name(crate::types::PdfName::new("ET")),
1153            ContentOperator::SetFont(name, size) => {
1154                let mut dict = PdfDictionary::new();
1155                dict.insert("Font", PdfValue::Name(crate::types::PdfName::new(name)));
1156                dict.insert("Size", PdfValue::Real(*size));
1157                PdfValue::Dictionary(dict)
1158            }
1159            ContentOperator::ShowText(text) => {
1160                PdfValue::String(crate::types::PdfString::new_literal(text.clone()))
1161            }
1162            ContentOperator::MoveText(x, y) => {
1163                let mut dict = PdfDictionary::new();
1164                dict.insert("X", PdfValue::Real(*x));
1165                dict.insert("Y", PdfValue::Real(*y));
1166                PdfValue::Dictionary(dict)
1167            }
1168            ContentOperator::PaintXObject(name) => PdfValue::Name(crate::types::PdfName::new(name)),
1169            _ => {
1170                // For other operators, create a simple name value
1171                PdfValue::Name(crate::types::PdfName::new(format!("Op_{}", index)))
1172            }
1173        };
1174
1175        let node_id = ast.create_node(NodeType::ContentOperator, value);
1176
1177        // Add metadata
1178        if let Some(node) = ast.get_node_mut(node_id) {
1179            node.metadata
1180                .properties
1181                .insert("operator_type".to_string(), format!("{:?}", operator));
1182            node.metadata
1183                .properties
1184                .insert("index".to_string(), index.to_string());
1185        }
1186
1187        node_id
1188    }
1189}
1190
/// Location details recorded for an object stored inside an object stream.
///
/// Every field is optional; `Default` yields a value with all fields unset.
///
/// NOTE(review): the field descriptions below are inferred from the field
/// names only — confirm against the code that fills this struct.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct CompressedObjectMeta {
    /// Presumably an absolute position in the file.
    file_offset: Option<u64>,
    /// Presumably the position of the containing object stream.
    container_offset: Option<u64>,
    /// Presumably the object's position within its container.
    object_offset: Option<u64>,
    /// Presumably the object's length in bytes.
    object_length: Option<usize>,
}
1198
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Resolver over an in-memory buffer, as used by every test below.
    type MemoryResolver = ReferenceResolver<Cursor<Vec<u8>>>;

    #[test]
    fn test_object_header_parsing() {
        let parsed = MemoryResolver::parse_object_header(b"123 0 obj");
        assert!(parsed.is_ok());

        let (_, obj_id) = parsed.unwrap();
        assert_eq!(obj_id.number, 123);
        assert_eq!(obj_id.generation, 0);
    }

    #[test]
    fn test_find_next_object() {
        let input = b"some text 42 0 obj more text";
        // The object header starts at the "42", ten bytes into the input.
        assert_eq!(MemoryResolver::find_next_object(input), Some(10));
    }

    #[test]
    fn test_reference_collection() {
        // Zero-filled stand-in for a PDF file, large enough for the resolver's
        // initial seeks.
        let backing = Cursor::new(vec![0u8; 2048]);
        let limits = crate::performance::PerformanceLimits::default();
        let mut resolver = ReferenceResolver::new(backing, true, limits).unwrap();
        let mut ast = PdfAstGraph::new();

        // A root dictionary holding exactly one indirect reference.
        let mut root_dict = PdfDictionary::new();
        root_dict.insert("Ref", PdfValue::Reference(PdfReference::new(5, 0)));
        let root_id = ast.create_node(NodeType::Root, PdfValue::Dictionary(root_dict));

        if let Some(root) = ast.get_node(root_id) {
            resolver.collect_references_from_node(root_id, &root.value);
        }

        assert_eq!(resolver.pending_references.len(), 1);
    }
}