Skip to main content

justpdf_core/xref/
table.rs

1use std::collections::HashMap;
2
3use crate::error::{JustPdfError, Result};
4use crate::object::{self, PdfDict, PdfObject};
5use crate::tokenizer::Tokenizer;
6use crate::tokenizer::reader::PdfReader;
7
/// A single entry in the cross-reference table.
///
/// Entries are plain value types: they can be cloned and compared for
/// equality, which keeps assertions on parsed tables straightforward.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XrefEntry {
    /// Free object.
    Free {
        /// Value of the entry's first numeric field (the next free object number).
        next_free: u32,
        /// Generation number stored in the entry.
        gen_num: u16,
    },
    /// In-use object at a byte offset.
    InUse {
        /// Byte offset recorded for the object.
        offset: u64,
        /// Generation number stored in the entry.
        gen_num: u16,
    },
    /// Compressed object inside an object stream (PDF 1.5+).
    Compressed {
        /// Object number of the object stream that holds this object.
        obj_stream_num: u32,
        /// Index of this object within that object stream.
        index_within: u16,
    },
}
21
22/// The complete cross-reference table with merged trailer.
23#[derive(Debug)]
24pub struct Xref {
25    pub entries: HashMap<u32, XrefEntry>,
26    pub trailer: PdfDict,
27}
28
29impl Default for Xref {
30    fn default() -> Self {
31        Self::new()
32    }
33}
34
35impl Xref {
36    pub fn new() -> Self {
37        Self {
38            entries: HashMap::new(),
39            trailer: PdfDict::new(),
40        }
41    }
42
43    /// Number of entries.
44    pub fn len(&self) -> usize {
45        self.entries.len()
46    }
47
48    pub fn is_empty(&self) -> bool {
49        self.entries.is_empty()
50    }
51
52    /// Get entry for an object number.
53    pub fn get(&self, obj_num: u32) -> Option<&XrefEntry> {
54        self.entries.get(&obj_num)
55    }
56
57    /// Get /Size from trailer.
58    pub fn size(&self) -> u32 {
59        self.trailer.get_i64(b"Size").unwrap_or(0) as u32
60    }
61}
62
63/// Parse a traditional xref table at the given offset.
64/// Returns the entries and the trailer dictionary.
65pub fn parse_xref_table(data: &[u8], offset: usize) -> Result<(Vec<(u32, XrefEntry)>, PdfDict)> {
66    let mut reader = PdfReader::new_at(data, offset);
67
68    // Expect "xref"
69    let remaining = reader.remaining();
70    if !remaining.starts_with(b"xref") {
71        return Err(JustPdfError::InvalidXref {
72            offset,
73            detail: "expected 'xref' keyword".into(),
74        });
75    }
76    reader.advance(4);
77    reader.skip_whitespace();
78
79    let mut entries = Vec::new();
80
81    // Parse subsections: each starts with "start_obj count"
82    loop {
83        // Check if we hit "trailer"
84        let remaining = reader.remaining();
85        if remaining.starts_with(b"trailer") {
86            break;
87        }
88        if reader.is_eof() {
89            return Err(JustPdfError::TrailerNotFound);
90        }
91
92        // Read start object number and count
93        let start_obj = read_ascii_number(&mut reader)?;
94        reader.skip_whitespace();
95        let count = read_ascii_number(&mut reader)?;
96        reader.skip_whitespace();
97
98        // Each entry: "nnnnnnnnnn ggggg n \r\n" (20 bytes typical)
99        // Format: 10-digit offset, space, 5-digit gen, space, 'n'/'f', EOL
100        // Be tolerant of different line endings and spacing.
101        for i in 0..count {
102            let entry_start = reader.pos();
103
104            // Read offset (10 digits)
105            let mut offset_buf = Vec::new();
106            while let Some(b) = reader.peek() {
107                if b.is_ascii_digit() {
108                    offset_buf.push(b);
109                    reader.advance(1);
110                } else {
111                    break;
112                }
113            }
114            reader.skip_whitespace();
115
116            // Read generation (5 digits)
117            let mut gen_buf = Vec::new();
118            while let Some(b) = reader.peek() {
119                if b.is_ascii_digit() {
120                    gen_buf.push(b);
121                    reader.advance(1);
122                } else {
123                    break;
124                }
125            }
126            reader.skip_whitespace();
127
128            // Read type char: 'n' or 'f'
129            let type_char = reader.next_byte().unwrap_or(b' ');
130            // Skip trailing whitespace/EOL
131            reader.skip_whitespace();
132
133            let offset_str = std::str::from_utf8(&offset_buf).unwrap_or("0");
134            let gen_str = std::str::from_utf8(&gen_buf).unwrap_or("0");
135            let offset_val: u64 = offset_str.parse().unwrap_or(0);
136            let gen_val: u16 = gen_str.parse().unwrap_or(0);
137
138            let obj_num = start_obj + i;
139
140            let entry = match type_char {
141                b'n' => XrefEntry::InUse {
142                    offset: offset_val,
143                    gen_num: gen_val,
144                },
145                b'f' => XrefEntry::Free {
146                    next_free: offset_val as u32,
147                    gen_num: gen_val,
148                },
149                _ => {
150                    return Err(JustPdfError::InvalidXref {
151                        offset: entry_start,
152                        detail: format!("unknown xref entry type: {:?}", type_char as char),
153                    });
154                }
155            };
156
157            entries.push((obj_num, entry));
158        }
159    }
160
161    // Parse trailer dictionary
162    reader.advance(7); // skip "trailer"
163    reader.skip_whitespace();
164
165    let mut tokenizer = Tokenizer::new_at(data, reader.pos());
166    let trailer_obj = object::parse_object(&mut tokenizer)?;
167
168    let trailer = match trailer_obj {
169        PdfObject::Dict(d) => d,
170        _ => {
171            return Err(JustPdfError::TrailerNotFound);
172        }
173    };
174
175    Ok((entries, trailer))
176}
177
178/// Read a decimal number from current position.
179fn read_ascii_number(reader: &mut PdfReader<'_>) -> Result<u32> {
180    let start = reader.pos();
181    let mut digits = Vec::new();
182    while let Some(b) = reader.peek() {
183        if b.is_ascii_digit() {
184            digits.push(b);
185            reader.advance(1);
186        } else {
187            break;
188        }
189    }
190    if digits.is_empty() {
191        return Err(JustPdfError::InvalidXref {
192            offset: start,
193            detail: "expected number".into(),
194        });
195    }
196    let s = std::str::from_utf8(&digits).unwrap();
197    s.parse::<u32>().map_err(|_| JustPdfError::InvalidXref {
198        offset: start,
199        detail: format!("invalid number: {s}"),
200    })
201}
202
#[cfg(test)]
mod tests {
    use super::*;

    /// A single subsection with one free and two in-use entries.
    #[test]
    fn test_parse_xref_table() {
        let xref_data = b"xref\n\
            0 3\n\
            0000000000 65535 f \r\n\
            0000000100 00000 n \r\n\
            0000000200 00000 n \r\n\
            trailer\n\
            << /Size 3 /Root 1 0 R >>";

        let (entries, trailer) = parse_xref_table(xref_data, 0).unwrap();

        assert_eq!(entries.len(), 3);

        // Entry 0: free
        match &entries[0] {
            (
                0,
                XrefEntry::Free {
                    next_free: 0,
                    gen_num: 65535,
                },
            ) => {}
            other => panic!("unexpected entry 0: {other:?}"),
        }

        // Entry 1: in use at offset 100
        match &entries[1] {
            (
                1,
                XrefEntry::InUse {
                    offset: 100,
                    gen_num: 0,
                },
            ) => {}
            other => panic!("unexpected entry 1: {other:?}"),
        }

        // Entry 2: in use at offset 200
        match &entries[2] {
            (
                2,
                XrefEntry::InUse {
                    offset: 200,
                    gen_num: 0,
                },
            ) => {}
            other => panic!("unexpected entry 2: {other:?}"),
        }

        assert_eq!(trailer.get_i64(b"Size"), Some(3));
    }

    /// Object numbers restart at each subsection's declared first object.
    #[test]
    fn test_parse_xref_table_multiple_subsections() {
        let xref_data = b"xref\n\
            0 1\n\
            0000000000 65535 f \r\n\
            3 2\n\
            0000000300 00000 n \r\n\
            0000000400 00000 n \r\n\
            trailer\n\
            << /Size 5 >>";

        let (entries, _trailer) = parse_xref_table(xref_data, 0).unwrap();
        assert_eq!(entries.len(), 3);

        let obj_nums: Vec<u32> = entries.iter().map(|(n, _)| *n).collect();
        assert_eq!(obj_nums, vec![0, 3, 4]);
    }

    /// Data that does not start with the `xref` keyword is rejected up front.
    #[test]
    fn test_parse_xref_table_missing_keyword() {
        let result = parse_xref_table(b"trailer\n<< /Size 1 >>", 0);
        assert!(matches!(result, Err(JustPdfError::InvalidXref { .. })));
    }

    /// Running out of data before the `trailer` keyword is reported as such.
    #[test]
    fn test_parse_xref_table_missing_trailer() {
        let xref_data = b"xref\n\
            0 1\n\
            0000000000 65535 f \r\n";

        let result = parse_xref_table(xref_data, 0);
        assert!(matches!(result, Err(JustPdfError::TrailerNotFound)));
    }

    /// Only `n` and `f` are valid entry type characters.
    #[test]
    fn test_parse_xref_table_unknown_entry_type() {
        let xref_data = b"xref\n\
            0 1\n\
            0000000000 65535 x \r\n\
            trailer\n\
            << /Size 1 >>";

        let result = parse_xref_table(xref_data, 0);
        assert!(matches!(result, Err(JustPdfError::InvalidXref { .. })));
    }
}
273
274        let obj_nums: Vec<u32> = entries.iter().map(|(n, _)| *n).collect();
275        assert_eq!(obj_nums, vec![0, 3, 4]);
276    }
277}