1use crate::object::PdfObject;
6use crate::parser;
7use crate::tokenizer::{Token, Tokenizer};
8use folio_core::{FolioError, Result};
9use indexmap::IndexMap;
10
/// A single cross-reference entry describing where one object lives.
#[derive(Debug, Clone, Copy)]
pub enum XrefEntry {
    /// Object stored at byte `offset` in the file, with generation `gen_num`.
    InUse { offset: u64, gen_num: u16 },
    /// Free-list entry; `next_free` is the next free object number.
    Free { next_free: u32, gen_num: u16 },
    /// Object stored inside object stream `stream_obj` at position `index`
    /// (xref-stream type 2, PDF 1.5+).
    Compressed { stream_obj: u32, index: u32 },
}
21
/// A parsed cross-reference section: object locations plus the trailer dict.
#[derive(Debug, Clone)]
pub struct XrefTable {
    /// Object number -> location entry.
    pub entries: IndexMap<u32, XrefEntry>,
    /// Trailer dictionary (raw name bytes -> value), e.g. /Size, /Root, /Prev.
    pub trailer: IndexMap<Vec<u8>, PdfObject>,
}
30
31pub fn find_startxref(data: &[u8]) -> Result<u64> {
33 let search_start = data.len().saturating_sub(1024);
34 let search_region = &data[search_start..];
35
36 let needle = b"startxref";
37 let pos = search_region
38 .windows(needle.len())
39 .rposition(|w| w == needle)
40 .ok_or_else(|| FolioError::Parse {
41 offset: data.len() as u64,
42 message: "Could not find startxref".into(),
43 })?;
44
45 let after = search_start + pos + needle.len();
46 let mut tokenizer = Tokenizer::new_at(data, after);
47 tokenizer.skip_whitespace_and_comments();
48
49 match tokenizer.next_token()? {
50 Some(Token::Integer(offset)) => Ok(offset as u64),
51 other => Err(FolioError::Parse {
52 offset: after as u64,
53 message: format!("Expected xref offset after startxref, got {:?}", other),
54 }),
55 }
56}
57
/// Parses a classic (pre-PDF-1.5) cross-reference table starting at `offset`.
///
/// Expects the `xref` keyword, then one or more subsections of the form
/// `first_obj count` followed by `count` entries (`offset gen (n|f)`),
/// terminated by the `trailer` keyword. Returns the collected entries plus
/// the trailer dictionary (empty if no dictionary follows `trailer`).
///
/// # Errors
///
/// Returns [`FolioError::Parse`] if the `xref` keyword is missing or a
/// subsection header lacks its object count. Individual malformed entries
/// are skipped leniently rather than failing the whole table.
pub fn parse_xref_table(data: &[u8], offset: u64) -> Result<XrefTable> {
    let mut tokenizer = Tokenizer::new_at(data, offset as usize);

    // The section must begin with the literal `xref` keyword.
    match tokenizer.next_token()? {
        Some(Token::Keyword(ref kw)) if kw == b"xref" => {}
        _ => {
            return Err(FolioError::Parse {
                offset,
                message: "Expected 'xref' keyword".into(),
            });
        }
    }

    let mut entries = IndexMap::new();

    // One iteration per subsection; ends at `trailer` or an unexpected token.
    loop {
        tokenizer.skip_whitespace_and_comments();

        // Remember where this token started so we can rewind if it is
        // neither a subsection header nor the `trailer` keyword.
        let saved = tokenizer.pos();
        match tokenizer.next_token()? {
            Some(Token::Keyword(ref kw)) if kw == b"trailer" => break,
            Some(Token::Integer(first_obj)) => {
                // Subsection header is `first_obj count`.
                let count = match tokenizer.next_token()? {
                    Some(Token::Integer(n)) => n as u32,
                    _ => {
                        return Err(FolioError::Parse {
                            offset: tokenizer.pos() as u64,
                            message: "Expected object count in xref subsection".into(),
                        });
                    }
                };

                for i in 0..count {
                    tokenizer.skip_whitespace();
                    let obj_num = first_obj as u32 + i;

                    // Lenient entry parsing: any malformed field skips the
                    // entry. NOTE(review): `continue` leaves the tokenizer
                    // mid-entry, so subsequent entries of a damaged
                    // subsection may desync — confirm this is intended.
                    let entry_offset = match tokenizer.next_token()? {
                        Some(Token::Integer(n)) => n as u64,
                        _ => continue,
                    };
                    let gen_num = match tokenizer.next_token()? {
                        Some(Token::Integer(n)) => n as u16,
                        _ => continue,
                    };
                    // Third field is the keyword `n` (in use) or `f` (free).
                    let in_use = match tokenizer.next_token()? {
                        Some(Token::Keyword(ref kw)) => kw == b"n",
                        _ => continue,
                    };

                    let entry = if in_use {
                        XrefEntry::InUse {
                            offset: entry_offset,
                            gen_num,
                        }
                    } else {
                        // For free entries the first field is the next free
                        // object number, not a byte offset.
                        XrefEntry::Free {
                            next_free: entry_offset as u32,
                            gen_num,
                        }
                    };

                    entries.insert(obj_num, entry);
                }
            }
            _ => {
                // Not a subsection or trailer: rewind so the stray token can
                // be consumed by whatever follows, and stop scanning.
                tokenizer.set_pos(saved);
                break;
            }
        }
    }

    // Parse the trailer dictionary; tolerate its absence with an empty dict.
    let trailer = match parser::parse_object(&mut tokenizer)? {
        Some(PdfObject::Dict(d)) => d,
        _ => IndexMap::new(),
    };

    Ok(XrefTable { entries, trailer })
}
137
138fn parse_xref_stream(
143 stream_dict: &IndexMap<Vec<u8>, PdfObject>,
144 stream_data: &[u8],
145) -> Result<IndexMap<u32, XrefEntry>> {
146 let w_array = stream_dict
148 .get(b"W".as_slice())
149 .and_then(|o| o.as_array())
150 .ok_or_else(|| FolioError::Parse {
151 offset: 0,
152 message: "Xref stream missing /W array".into(),
153 })?;
154
155 if w_array.len() < 3 {
156 return Err(FolioError::Parse {
157 offset: 0,
158 message: format!("Xref stream /W array too short: {} elements", w_array.len()),
159 });
160 }
161
162 let w0 = w_array[0].as_i64().unwrap_or(0) as usize;
163 let w1 = w_array[1].as_i64().unwrap_or(0) as usize;
164 let w2 = w_array[2].as_i64().unwrap_or(0) as usize;
165 let entry_size = w0 + w1 + w2;
166
167 if entry_size == 0 {
168 return Ok(IndexMap::new());
169 }
170
171 let decoded_data = {
173 let filter_names: Vec<Vec<u8>> = match stream_dict.get(b"Filter".as_slice()) {
174 Some(PdfObject::Name(name)) => vec![name.clone()],
175 Some(PdfObject::Array(arr)) => arr
176 .iter()
177 .filter_map(|o| o.as_name().map(|n| n.to_vec()))
178 .collect(),
179 _ => vec![],
180 };
181
182 if filter_names.is_empty() {
183 stream_data.to_vec()
184 } else {
185 let params_list = get_decode_params(stream_dict, filter_names.len());
186 folio_filters::decode_filter_chain(&filter_names, stream_data, ¶ms_list)?
187 }
188 };
189
190 let size = stream_dict
193 .get(b"Size".as_slice())
194 .and_then(|o| o.as_i64())
195 .unwrap_or(0) as u32;
196
197 let index_ranges: Vec<(u32, u32)> = match stream_dict.get(b"Index".as_slice()) {
198 Some(PdfObject::Array(arr)) => {
199 let mut ranges = Vec::new();
200 let mut i = 0;
201 while i + 1 < arr.len() {
202 let first = arr[i].as_i64().unwrap_or(0) as u32;
203 let count = arr[i + 1].as_i64().unwrap_or(0) as u32;
204 ranges.push((first, count));
205 i += 2;
206 }
207 ranges
208 }
209 _ => vec![(0, size)],
210 };
211
212 let mut entries = IndexMap::new();
214 let mut data_pos = 0;
215
216 for (first_obj, count) in &index_ranges {
217 for i in 0..*count {
218 if data_pos + entry_size > decoded_data.len() {
219 break;
220 }
221
222 let obj_num = first_obj + i;
223
224 let type_field = read_field(&decoded_data, data_pos, w0, 1); let field2 = read_field(&decoded_data, data_pos + w0, w1, 0);
226 let field3 = read_field(&decoded_data, data_pos + w0 + w1, w2, 0);
227
228 data_pos += entry_size;
229
230 let entry = match type_field {
231 0 => XrefEntry::Free {
232 next_free: field2 as u32,
233 gen_num: field3 as u16,
234 },
235 1 => XrefEntry::InUse {
236 offset: field2,
237 gen_num: field3 as u16,
238 },
239 2 => XrefEntry::Compressed {
240 stream_obj: field2 as u32,
241 index: field3 as u32,
242 },
243 _ => continue, };
245
246 entries.insert(obj_num, entry);
247 }
248 }
249
250 Ok(entries)
251}
252
/// Reads a big-endian unsigned integer of `width` bytes from `data` at `offset`.
///
/// A `width` of zero means the field is absent from the stream, so the
/// spec-defined `default_value` is returned instead. Bytes past the end of
/// `data` are simply not folded in, so a truncated trailing field yields a
/// smaller value rather than panicking.
fn read_field(data: &[u8], offset: usize, width: usize, default_value: u64) -> u64 {
    if width == 0 {
        return default_value;
    }
    data.iter()
        .skip(offset)
        .take(width)
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
267
268fn get_decode_params(
270 dict: &IndexMap<Vec<u8>, PdfObject>,
271 filter_count: usize,
272) -> Vec<Option<folio_filters::FilterParams>> {
273 match dict.get(b"DecodeParms".as_slice()) {
274 Some(PdfObject::Dict(d)) => {
275 vec![Some(dict_to_filter_params(d)); filter_count.max(1)]
276 }
277 Some(PdfObject::Array(arr)) => arr
278 .iter()
279 .map(|obj| obj.as_dict().map(dict_to_filter_params))
280 .collect(),
281 _ => vec![None; filter_count],
282 }
283}
284
285fn dict_to_filter_params(dict: &IndexMap<Vec<u8>, PdfObject>) -> folio_filters::FilterParams {
286 folio_filters::FilterParams {
287 predictor: dict
288 .get(b"Predictor".as_slice())
289 .and_then(|o| o.as_i64())
290 .unwrap_or(1) as i32,
291 colors: dict
292 .get(b"Colors".as_slice())
293 .and_then(|o| o.as_i64())
294 .unwrap_or(1) as i32,
295 bits_per_component: dict
296 .get(b"BitsPerComponent".as_slice())
297 .and_then(|o| o.as_i64())
298 .unwrap_or(8) as i32,
299 columns: dict
300 .get(b"Columns".as_slice())
301 .and_then(|o| o.as_i64())
302 .unwrap_or(1) as i32,
303 early_change: dict
304 .get(b"EarlyChange".as_slice())
305 .and_then(|o| o.as_i64())
306 .unwrap_or(1) as i32,
307 }
308}
309
/// Walks the full xref chain: starts at the offset named by `startxref` and
/// follows `/Prev` links back through older tables and xref streams.
///
/// Entries merge newest-first: the first section to define an object number
/// wins (`or_insert`), matching PDF incremental-update semantics. The
/// newest section's trailer becomes the combined trailer.
///
/// # Errors
///
/// Returns [`FolioError::Parse`] if `startxref` cannot be found, an offset
/// points past the end of the file, or a classic xref table fails to parse.
/// Xref-stream decode failures are logged and skipped, not propagated.
pub fn parse_all_xrefs(data: &[u8]) -> Result<XrefTable> {
    let startxref = find_startxref(data)?;
    let mut combined_entries = IndexMap::new();
    let mut final_trailer = IndexMap::new();
    let mut offset = startxref;
    // Guards against /Prev cycles in corrupt files.
    let mut visited = std::collections::HashSet::new();

    loop {
        if visited.contains(&offset) {
            break;
        }
        visited.insert(offset);

        if offset as usize >= data.len() {
            return Err(FolioError::Parse {
                offset,
                message: "Xref offset beyond end of file".into(),
            });
        }

        // A classic table starts with the literal keyword `xref`; otherwise
        // we expect an indirect object holding an xref stream.
        let is_xref_table = data[offset as usize..].starts_with(b"xref");

        if is_xref_table {
            let table = parse_xref_table(data, offset)?;

            // Newest definition of each object wins.
            for (num, entry) in table.entries {
                combined_entries.entry(num).or_insert(entry);
            }

            // Keep only the first (newest) trailer encountered.
            if final_trailer.is_empty() {
                final_trailer = table.trailer.clone();
            }

            // Follow /Prev to the previous section, or stop.
            match table.trailer.get(b"Prev".as_slice()) {
                Some(PdfObject::Integer(prev)) => offset = *prev as u64,
                _ => break,
            }
        } else {
            match parser::parse_indirect_object_at(data, offset as usize) {
                Ok((_id, PdfObject::Stream(stream))) => {
                    // For xref streams the stream dict doubles as the trailer.
                    if final_trailer.is_empty() {
                        final_trailer = stream.dict.clone();
                    }

                    // Best-effort: a broken stream loses its entries but
                    // does not abort the whole chain walk.
                    match parse_xref_stream(&stream.dict, &stream.data) {
                        Ok(entries) => {
                            for (num, entry) in entries {
                                combined_entries.entry(num).or_insert(entry);
                            }
                        }
                        Err(e) => {
                            log::warn!("Failed to decode xref stream: {}", e);
                        }
                    }

                    match stream.dict.get(b"Prev".as_slice()) {
                        Some(PdfObject::Integer(prev)) => offset = *prev as u64,
                        _ => break,
                    }
                }
                // Anything else at the offset ends the chain quietly.
                _ => break,
            }
        }
    }

    Ok(XrefTable {
        entries: combined_entries,
        trailer: final_trailer,
    })
}
383
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_find_startxref() {
        let data = b"%PDF-1.4\n... content ...\nstartxref\n12345\n%%EOF";
        let offset = find_startxref(data).unwrap();
        assert_eq!(offset, 12345);
    }

    #[test]
    fn test_parse_xref_table() {
        let data = b"xref\n0 3\n0000000000 65535 f \n0000000009 00000 n \n0000000074 00000 n \ntrailer\n<< /Size 3 /Root 1 0 R >>\nstartxref\n0\n%%EOF";
        let table = parse_xref_table(data, 0).unwrap();
        assert_eq!(table.entries.len(), 3);

        // Object 1 is in use at byte offset 9, generation 0.
        match table.entries.get(&1) {
            Some(XrefEntry::InUse { offset, gen_num }) => {
                assert_eq!(*offset, 9);
                assert_eq!(*gen_num, 0);
            }
            other => panic!("Expected InUse, got {:?}", other),
        }
    }

    #[test]
    fn test_read_field() {
        // Multi-byte fields are read big-endian.
        assert_eq!(read_field(&[0x01, 0x02], 0, 2, 0), 0x0102);
        assert_eq!(read_field(&[0xFF], 0, 1, 0), 255);
        // Zero width returns the caller-supplied default.
        assert_eq!(read_field(&[], 0, 0, 42), 42);
        assert_eq!(read_field(&[0x00, 0x01, 0x00], 0, 3, 0), 256);
    }

    #[test]
    fn test_parse_xref_stream_entries() {
        // Three raw 4-byte entries for /W [1 2 1]:
        //   type 0 (free):       next_free 0, gen 255
        //   type 1 (in use):     offset 100,  gen 0
        //   type 2 (compressed): stream 5,    index 0
        let stream_data: Vec<u8> = vec![
            0, 0, 0, 255, //
            1, 0, 100, 0, //
            2, 0, 5, 0, //
        ];

        let mut dict = IndexMap::new();
        dict.insert(
            b"W".to_vec(),
            PdfObject::Array(vec![
                PdfObject::Integer(1),
                PdfObject::Integer(2),
                PdfObject::Integer(1),
            ]),
        );
        dict.insert(b"Size".to_vec(), PdfObject::Integer(3));

        let entries = parse_xref_stream(&dict, &stream_data).unwrap();
        assert_eq!(entries.len(), 3);

        match entries.get(&0) {
            Some(XrefEntry::Free { next_free, gen_num }) => {
                assert_eq!(*next_free, 0);
                assert_eq!(*gen_num, 255);
            }
            other => panic!("Expected Free, got {:?}", other),
        }

        match entries.get(&1) {
            Some(XrefEntry::InUse { offset, gen_num }) => {
                assert_eq!(*offset, 100);
                assert_eq!(*gen_num, 0);
            }
            other => panic!("Expected InUse, got {:?}", other),
        }

        match entries.get(&2) {
            Some(XrefEntry::Compressed { stream_obj, index }) => {
                assert_eq!(*stream_obj, 5);
                assert_eq!(*index, 0);
            }
            other => panic!("Expected Compressed, got {:?}", other),
        }
    }
}