// folio_cos/xref.rs
//! PDF cross-reference table parsing.
//!
//! Handles both traditional xref tables and cross-reference streams (PDF 1.5+).

use crate::object::PdfObject;
use crate::parser;
use crate::tokenizer::{Token, Tokenizer};
use folio_core::{FolioError, Result};
use indexmap::IndexMap;

/// A cross-reference entry for one object.
///
/// One variant per entry type defined by the PDF specification.
#[derive(Debug, Clone, Copy)]
pub enum XrefEntry {
    /// Object is in use at the given byte offset.
    InUse {
        /// Byte offset of the object from the start of the file.
        offset: u64,
        /// Generation number of the object.
        gen_num: u16,
    },
    /// Object has been freed.
    Free {
        /// Object number of the next free object in the free list.
        next_free: u32,
        /// Generation number to be used if this object number is reused.
        gen_num: u16,
    },
    /// Object is stored in an object stream (PDF 1.5+).
    Compressed {
        /// Object number of the object stream that contains this object.
        stream_obj: u32,
        /// Index of this object within the object stream.
        index: u32,
    },
}
21
/// Parsed cross-reference table with all entries and the trailer dictionary.
#[derive(Debug, Clone)]
pub struct XrefTable {
    /// Map from object number to xref entry.
    pub entries: IndexMap<u32, XrefEntry>,
    /// The trailer dictionary (name bytes -> object), holding keys such as
    /// /Size, /Root and /Prev.
    pub trailer: IndexMap<Vec<u8>, PdfObject>,
}
30
31/// Find the `startxref` offset from the end of a PDF file.
32pub fn find_startxref(data: &[u8]) -> Result<u64> {
33    let search_start = data.len().saturating_sub(1024);
34    let search_region = &data[search_start..];
35
36    let needle = b"startxref";
37    let pos = search_region
38        .windows(needle.len())
39        .rposition(|w| w == needle)
40        .ok_or_else(|| FolioError::Parse {
41            offset: data.len() as u64,
42            message: "Could not find startxref".into(),
43        })?;
44
45    let after = search_start + pos + needle.len();
46    let mut tokenizer = Tokenizer::new_at(data, after);
47    tokenizer.skip_whitespace_and_comments();
48
49    match tokenizer.next_token()? {
50        Some(Token::Integer(offset)) => Ok(offset as u64),
51        other => Err(FolioError::Parse {
52            offset: after as u64,
53            message: format!("Expected xref offset after startxref, got {:?}", other),
54        }),
55    }
56}
57
/// Parse a traditional cross-reference table starting at the given offset.
///
/// Expected layout:
/// ```text
/// xref
/// <first_obj> <count>
/// <offset> <gen> <n|f>
/// ...
/// trailer
/// << ... >>
/// ```
/// Individual malformed entries are skipped rather than failing the whole
/// table; a missing trailer dictionary yields an empty one.
pub fn parse_xref_table(data: &[u8], offset: u64) -> Result<XrefTable> {
    let mut tokenizer = Tokenizer::new_at(data, offset as usize);

    // The section must start with the literal keyword `xref`.
    match tokenizer.next_token()? {
        Some(Token::Keyword(ref kw)) if kw == b"xref" => {}
        _ => {
            return Err(FolioError::Parse {
                offset,
                message: "Expected 'xref' keyword".into(),
            });
        }
    }

    let mut entries = IndexMap::new();

    // Each loop iteration consumes one subsection header
    // (`first_obj count`) or terminates on the `trailer` keyword.
    loop {
        tokenizer.skip_whitespace_and_comments();

        // Remember the position so we can rewind if the next token is
        // neither a subsection header nor `trailer`.
        let saved = tokenizer.pos();
        match tokenizer.next_token()? {
            Some(Token::Keyword(ref kw)) if kw == b"trailer" => break,
            Some(Token::Integer(first_obj)) => {
                let count = match tokenizer.next_token()? {
                    Some(Token::Integer(n)) => n as u32,
                    _ => {
                        return Err(FolioError::Parse {
                            offset: tokenizer.pos() as u64,
                            message: "Expected object count in xref subsection".into(),
                        });
                    }
                };

                for i in 0..count {
                    tokenizer.skip_whitespace();
                    let obj_num = first_obj as u32 + i;

                    // Entry layout: byte offset, generation number, then the
                    // keyword `n` (in use) or `f` (free). A malformed field
                    // skips just this entry.
                    let entry_offset = match tokenizer.next_token()? {
                        Some(Token::Integer(n)) => n as u64,
                        _ => continue,
                    };
                    let gen_num = match tokenizer.next_token()? {
                        Some(Token::Integer(n)) => n as u16,
                        _ => continue,
                    };
                    let in_use = match tokenizer.next_token()? {
                        // Any keyword other than `n` is treated as free.
                        Some(Token::Keyword(ref kw)) => kw == b"n",
                        _ => continue,
                    };

                    let entry = if in_use {
                        XrefEntry::InUse {
                            offset: entry_offset,
                            gen_num,
                        }
                    } else {
                        // For free entries the first number is the object
                        // number of the next free object, not a byte offset.
                        XrefEntry::Free {
                            next_free: entry_offset as u32,
                            gen_num,
                        }
                    };

                    entries.insert(obj_num, entry);
                }
            }
            _ => {
                // Unexpected token: rewind so the caller's tokenizer state
                // is consistent, then stop scanning subsections.
                tokenizer.set_pos(saved);
                break;
            }
        }
    }

    // The trailer dictionary follows the `trailer` keyword; fall back to
    // an empty dict when it is absent or not a dictionary.
    let trailer = match parser::parse_object(&mut tokenizer)? {
        Some(PdfObject::Dict(d)) => d,
        _ => IndexMap::new(),
    };

    Ok(XrefTable { entries, trailer })
}
137
/// Parse a cross-reference stream and extract entries.
///
/// The stream dict serves as both the trailer and the xref data container.
/// See PDF spec ISO 32000-2:2020 §7.5.8.
///
/// Layout: `/W [w0 w1 w2]` gives the byte widths of the three fields of
/// each entry (type, field2, field3); `/Index [first count ...]` lists the
/// object-number ranges covered (default `[0 /Size]`). After the stream
/// filters are decoded, entries are packed big-endian with no separators.
fn parse_xref_stream(
    stream_dict: &IndexMap<Vec<u8>, PdfObject>,
    stream_data: &[u8],
) -> Result<IndexMap<u32, XrefEntry>> {
    // Get /W array: field widths [type_width, field2_width, field3_width]
    let w_array = stream_dict
        .get(b"W".as_slice())
        .and_then(|o| o.as_array())
        .ok_or_else(|| FolioError::Parse {
            offset: 0,
            message: "Xref stream missing /W array".into(),
        })?;

    if w_array.len() < 3 {
        return Err(FolioError::Parse {
            offset: 0,
            message: format!("Xref stream /W array too short: {} elements", w_array.len()),
        });
    }

    let w0 = w_array[0].as_i64().unwrap_or(0) as usize;
    let w1 = w_array[1].as_i64().unwrap_or(0) as usize;
    let w2 = w_array[2].as_i64().unwrap_or(0) as usize;
    let entry_size = w0 + w1 + w2;

    // All widths zero: no decodable entries at all.
    if entry_size == 0 {
        return Ok(IndexMap::new());
    }

    // Decode stream data (apply filters)
    let decoded_data = {
        // /Filter may be a single name or an array of names; non-name
        // array elements are silently dropped.
        let filter_names: Vec<Vec<u8>> = match stream_dict.get(b"Filter".as_slice()) {
            Some(PdfObject::Name(name)) => vec![name.clone()],
            Some(PdfObject::Array(arr)) => arr
                .iter()
                .filter_map(|o| o.as_name().map(|n| n.to_vec()))
                .collect(),
            _ => vec![],
        };

        if filter_names.is_empty() {
            stream_data.to_vec()
        } else {
            let params_list = get_decode_params(stream_dict, filter_names.len());
            folio_filters::decode_filter_chain(&filter_names, stream_data, &params_list)?
        }
    };

    // Get /Index array: [first_obj count first_obj count ...]
    // Default is [0 Size]
    let size = stream_dict
        .get(b"Size".as_slice())
        .and_then(|o| o.as_i64())
        .unwrap_or(0) as u32;

    let index_ranges: Vec<(u32, u32)> = match stream_dict.get(b"Index".as_slice()) {
        Some(PdfObject::Array(arr)) => {
            let mut ranges = Vec::new();
            let mut i = 0;
            // Consume pairs; a trailing odd element is ignored.
            while i + 1 < arr.len() {
                let first = arr[i].as_i64().unwrap_or(0) as u32;
                let count = arr[i + 1].as_i64().unwrap_or(0) as u32;
                ranges.push((first, count));
                i += 2;
            }
            ranges
        }
        _ => vec![(0, size)],
    };

    // Parse entries
    let mut entries = IndexMap::new();
    let mut data_pos = 0;

    for (first_obj, count) in &index_ranges {
        for i in 0..*count {
            // Stop this range if the decoded data runs out early.
            if data_pos + entry_size > decoded_data.len() {
                break;
            }

            let obj_num = first_obj + i;

            // Per spec, an omitted (zero-width) type field defaults to 1;
            // omitted numeric fields default to 0.
            let type_field = read_field(&decoded_data, data_pos, w0, 1); // default type=1
            let field2 = read_field(&decoded_data, data_pos + w0, w1, 0);
            let field3 = read_field(&decoded_data, data_pos + w0 + w1, w2, 0);

            data_pos += entry_size;

            // Field meaning depends on the entry type:
            //   type 0: field2 = next free obj num,  field3 = gen
            //   type 1: field2 = byte offset,        field3 = gen
            //   type 2: field2 = object stream num,  field3 = index in stream
            let entry = match type_field {
                0 => XrefEntry::Free {
                    next_free: field2 as u32,
                    gen_num: field3 as u16,
                },
                1 => XrefEntry::InUse {
                    offset: field2,
                    gen_num: field3 as u16,
                },
                2 => XrefEntry::Compressed {
                    stream_obj: field2 as u32,
                    index: field3 as u32,
                },
                _ => continue, // Unknown type, skip
            };

            entries.insert(obj_num, entry);
        }
    }

    Ok(entries)
}
252
/// Read a big-endian integer field of `width` bytes from `data` at `offset`.
/// If width is 0, returns `default_value`.
///
/// Bytes lying beyond the end of `data` are ignored, so a truncated field
/// yields the value of the bytes that are present.
fn read_field(data: &[u8], offset: usize, width: usize, default_value: u64) -> u64 {
    if width == 0 {
        return default_value;
    }
    // Clamp the requested range to the available data, then accumulate
    // big-endian: each byte shifts the running value left by 8.
    let end = data.len().min(offset.saturating_add(width));
    data.get(offset..end)
        .unwrap_or(&[])
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
267
268/// Extract DecodeParms for stream filter chain.
269fn get_decode_params(
270    dict: &IndexMap<Vec<u8>, PdfObject>,
271    filter_count: usize,
272) -> Vec<Option<folio_filters::FilterParams>> {
273    match dict.get(b"DecodeParms".as_slice()) {
274        Some(PdfObject::Dict(d)) => {
275            vec![Some(dict_to_filter_params(d)); filter_count.max(1)]
276        }
277        Some(PdfObject::Array(arr)) => arr
278            .iter()
279            .map(|obj| obj.as_dict().map(dict_to_filter_params))
280            .collect(),
281        _ => vec![None; filter_count],
282    }
283}
284
285fn dict_to_filter_params(dict: &IndexMap<Vec<u8>, PdfObject>) -> folio_filters::FilterParams {
286    folio_filters::FilterParams {
287        predictor: dict
288            .get(b"Predictor".as_slice())
289            .and_then(|o| o.as_i64())
290            .unwrap_or(1) as i32,
291        colors: dict
292            .get(b"Colors".as_slice())
293            .and_then(|o| o.as_i64())
294            .unwrap_or(1) as i32,
295        bits_per_component: dict
296            .get(b"BitsPerComponent".as_slice())
297            .and_then(|o| o.as_i64())
298            .unwrap_or(8) as i32,
299        columns: dict
300            .get(b"Columns".as_slice())
301            .and_then(|o| o.as_i64())
302            .unwrap_or(1) as i32,
303        early_change: dict
304            .get(b"EarlyChange".as_slice())
305            .and_then(|o| o.as_i64())
306            .unwrap_or(1) as i32,
307    }
308}
309
310/// Parse all cross-reference tables (following /Prev links for incremental updates).
311pub fn parse_all_xrefs(data: &[u8]) -> Result<XrefTable> {
312    let startxref = find_startxref(data)?;
313    let mut combined_entries = IndexMap::new();
314    let mut final_trailer = IndexMap::new();
315    let mut offset = startxref;
316    let mut visited = std::collections::HashSet::new();
317
318    loop {
319        if visited.contains(&offset) {
320            break;
321        }
322        visited.insert(offset);
323
324        if offset as usize >= data.len() {
325            return Err(FolioError::Parse {
326                offset,
327                message: "Xref offset beyond end of file".into(),
328            });
329        }
330
331        let is_xref_table = data[offset as usize..].starts_with(b"xref");
332
333        if is_xref_table {
334            let table = parse_xref_table(data, offset)?;
335
336            for (num, entry) in table.entries {
337                combined_entries.entry(num).or_insert(entry);
338            }
339
340            if final_trailer.is_empty() {
341                final_trailer = table.trailer.clone();
342            }
343
344            match table.trailer.get(b"Prev".as_slice()) {
345                Some(PdfObject::Integer(prev)) => offset = *prev as u64,
346                _ => break,
347            }
348        } else {
349            // Cross-reference stream
350            match parser::parse_indirect_object_at(data, offset as usize) {
351                Ok((_id, PdfObject::Stream(stream))) => {
352                    if final_trailer.is_empty() {
353                        final_trailer = stream.dict.clone();
354                    }
355
356                    // Decode xref stream entries
357                    match parse_xref_stream(&stream.dict, &stream.data) {
358                        Ok(entries) => {
359                            for (num, entry) in entries {
360                                combined_entries.entry(num).or_insert(entry);
361                            }
362                        }
363                        Err(e) => {
364                            log::warn!("Failed to decode xref stream: {}", e);
365                        }
366                    }
367
368                    match stream.dict.get(b"Prev".as_slice()) {
369                        Some(PdfObject::Integer(prev)) => offset = *prev as u64,
370                        _ => break,
371                    }
372                }
373                _ => break,
374            }
375        }
376    }
377
378    Ok(XrefTable {
379        entries: combined_entries,
380        trailer: final_trailer,
381    })
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_startxref() {
        // The integer after the last `startxref` keyword is the xref offset.
        let data = b"%PDF-1.4\n... content ...\nstartxref\n12345\n%%EOF";
        let offset = find_startxref(data).unwrap();
        assert_eq!(offset, 12345);
    }

    #[test]
    fn test_parse_xref_table() {
        // One subsection of three entries: obj 0 free, objs 1-2 in use.
        let data = b"xref\n0 3\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \ntrailer\n<< /Size 3 /Root 1 0 R >>\nstartxref\n0\n%%EOF";
        let table = parse_xref_table(data, 0).unwrap();
        assert_eq!(table.entries.len(), 3);

        match table.entries.get(&1) {
            Some(XrefEntry::InUse { offset, gen_num }) => {
                assert_eq!(*offset, 9);
                assert_eq!(*gen_num, 0);
            }
            other => panic!("Expected InUse, got {:?}", other),
        }
    }

    #[test]
    fn test_read_field() {
        assert_eq!(read_field(&[0x01, 0x02], 0, 2, 0), 0x0102);
        assert_eq!(read_field(&[0xFF], 0, 1, 0), 255);
        assert_eq!(read_field(&[], 0, 0, 42), 42); // zero width returns default
        assert_eq!(read_field(&[0x00, 0x01, 0x00], 0, 3, 0), 256);
    }

    #[test]
    fn test_parse_xref_stream_entries() {
        // Simulate a simple xref stream: 3 entries, W=[1,2,1]
        // Entry 0: type=0 (free), next=0, gen=255
        // Entry 1: type=1 (in-use), offset=100, gen=0
        // Entry 2: type=2 (compressed), stream_obj=5, index=0
        let stream_data: Vec<u8> = vec![
            0, 0, 0, 255, // obj 0: free, next=0, gen=255
            1, 0, 100, 0, // obj 1: in-use, offset=100, gen=0
            2, 0, 5, 0, // obj 2: compressed, stream=5, index=0
        ];

        // No /Filter entry, so the data is used undecoded.
        let mut dict = IndexMap::new();
        dict.insert(
            b"W".to_vec(),
            PdfObject::Array(vec![
                PdfObject::Integer(1),
                PdfObject::Integer(2),
                PdfObject::Integer(1),
            ]),
        );
        dict.insert(b"Size".to_vec(), PdfObject::Integer(3));

        let entries = parse_xref_stream(&dict, &stream_data).unwrap();
        assert_eq!(entries.len(), 3);

        match entries.get(&0) {
            Some(XrefEntry::Free { next_free, gen_num }) => {
                assert_eq!(*next_free, 0);
                assert_eq!(*gen_num, 255);
            }
            other => panic!("Expected Free, got {:?}", other),
        }

        match entries.get(&1) {
            Some(XrefEntry::InUse { offset, gen_num }) => {
                assert_eq!(*offset, 100);
                assert_eq!(*gen_num, 0);
            }
            other => panic!("Expected InUse, got {:?}", other),
        }

        match entries.get(&2) {
            Some(XrefEntry::Compressed { stream_obj, index }) => {
                assert_eq!(*stream_obj, 5);
                assert_eq!(*index, 0);
            }
            other => panic!("Expected Compressed, got {:?}", other),
        }
    }
}