Skip to main content

pdf_syntax/
xref.rs

1//! Reading and querying the xref table of a PDF file.
2
3use crate::crypto::{DecryptionError, DecryptionTarget, Decryptor, get};
4use crate::data::Data;
5use crate::metadata::Metadata;
6use crate::object::Name;
7use crate::object::ObjectIdentifier;
8use crate::object::Stream;
9use crate::object::dict::keys::{
10    AUTHOR, CREATION_DATE, CREATOR, ENCRYPT, FIRST, ID, INDEX, INFO, KEYWORDS, MOD_DATE, N,
11    OCPROPERTIES, PAGES, PREV, PRODUCER, ROOT, SIZE, SUBJECT, TITLE, TYPE, VERSION, W, XREF_STM,
12};
13use crate::object::indirect::IndirectObject;
14use crate::object::{Array, MaybeRef};
15use crate::object::{DateTime, Dict};
16use crate::object::{Object, ObjectLike};
17use crate::pdf::{PdfLoadLimits, PdfVersion};
18use crate::reader::Reader;
19use crate::reader::{Readable, ReaderContext, ReaderExt};
20use crate::sync::{Arc, FxHashMap, RwLock, RwLockExt};
21use crate::{PdfData, object};
22use alloc::vec;
23use alloc::vec::Vec;
24use core::cmp::max;
25use core::iter;
26use core::ops::Deref;
27use log::{error, warn};
28
/// Length in bytes of a single entry record in a classic (non-stream) xref table; the table
/// parser reads entries in chunks of exactly this size.
pub(crate) const XREF_ENTRY_LEN: usize = 20;
30
/// Errors that can occur while building an [`XRef`].
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefError {
    /// The xref data, the trailer dictionary or a required entry in it could not be found
    /// or parsed.
    Unknown,
    /// Wraps a [`DecryptionError`]; presumably produced while setting up the decryptor for
    /// an encrypted document (the conversion is not visible in this file section).
    Encryption(DecryptionError),
}
36
37/// Parse the "root" xref from the PDF.
38pub(crate) fn root_xref(
39    data: PdfData,
40    password: &[u8],
41    limits: PdfLoadLimits,
42) -> Result<XRef, XRefError> {
43    let mut xref_map = FxHashMap::default();
44    let xref_pos = find_last_xref_pos(data.as_ref()).ok_or(XRefError::Unknown)?;
45    let trailer =
46        populate_xref_impl(data.as_ref(), xref_pos, &mut xref_map).ok_or(XRefError::Unknown)?;
47
48    XRef::new(
49        data.clone(),
50        xref_map,
51        XRefInput::TrailerDictData(trailer),
52        false,
53        password,
54        limits,
55    )
56}
57
58/// Try to manually parse the PDF to build an xref table and trailer dictionary.
59pub(crate) fn fallback(data: PdfData, password: &[u8], limits: PdfLoadLimits) -> Option<XRef> {
60    warn!("xref table was invalid, trying to manually build xref table");
61    let (xref_map, xref_input) = fallback_xref_map(&data, password);
62
63    if let Some(xref_input) = xref_input {
64        warn!("rebuild xref table with {} entries", xref_map.len());
65
66        XRef::new(data.clone(), xref_map, xref_input, true, password, limits).ok()
67    } else {
68        warn!("couldn't find trailer dictionary, failed to rebuild xref table");
69
70        None
71    }
72}
73
/// Scan `data` for objects and trailer candidates, starting with a dummy reader context.
///
/// Passes `recurse = true`, which lets the scan run once more with a decrypting context if
/// the chosen trailer dictionary has an `Encrypt` entry.
fn fallback_xref_map<'a>(data: &'a PdfData, password: &[u8]) -> (XrefMap, Option<XRefInput<'a>>) {
    fallback_xref_map_inner(data, ReaderContext::dummy(), true, password)
}
77
/// Walk over the whole file and rebuild an xref map from the objects that can be recognised.
///
/// * `data` - the raw PDF data to scan.
/// * `dummy_ctx` - the reader context used for all reads; its object number is updated as
///   object identifiers are found.
/// * `recurse` - whether a second scan with a decrypting context may be started when the
///   selected trailer dictionary contains `Encrypt`.
/// * `password` - forwarded to [`XRef::new`] for that second scan.
///
/// Returns the rebuilt map together with the input for [`XRef::new`]: the data of the chosen
/// trailer dictionary if one passed validation, otherwise the identifier of the last object
/// seen before a `/Type /Catalog` dictionary, otherwise `None`.
fn fallback_xref_map_inner<'a>(
    data: &'a PdfData,
    mut dummy_ctx: ReaderContext<'a>,
    recurse: bool,
    password: &[u8],
) -> (XrefMap, Option<XRefInput<'a>>) {
    let mut xref_map = FxHashMap::default();
    let mut trailer_dicts = vec![];
    let mut root_ref = None;

    let mut r = Reader::new(data.as_ref());

    // Identifier of the most recently accepted object header.
    let mut last_obj_num = None;

    loop {
        let cur_pos = r.offset();

        // Snapshot of the reader at `cur_pos`, used below to re-read a dictionary as a stream.
        let mut old_r = r.clone();

        if let Some(obj_id) = r.read::<ObjectIdentifier>(&dummy_ctx) {
            let mut cloned = r.clone();
            // Check that the object following it is actually valid before inserting it.
            cloned.skip_white_spaces_and_comments();
            if cloned.skip::<Object<'_>>(false).is_some() {
                xref_map.insert(obj_id, EntryType::Normal(cur_pos));
                last_obj_num = Some(obj_id);
                dummy_ctx.set_obj_number(obj_id);
            }
        } else if let Some(dict) = r.read::<Dict<'_>>(&dummy_ctx) {
            // Any dictionary with a `Root` entry is a trailer candidate.
            if dict.contains_key(ROOT) {
                trailer_dicts.push(dict.clone());
            }

            // Remember the catalog in case no trailer dictionary turns out to be usable.
            if dict
                .get::<Name>(TYPE)
                .is_some_and(|n| n.as_str() == "Catalog")
            {
                root_ref = last_obj_num;
            }

            // If the dictionary belongs to an object stream, register the objects inside it.
            if let Some(stream) = old_r.read::<Stream<'_>>(&dummy_ctx)
                && stream.dict().get::<Name>(TYPE).as_deref() == Some(b"ObjStm")
                && let Some(data) = stream.decoded().ok()
                && let Some(last_obj_num) = last_obj_num
                && let Some(obj_stream) = ObjectStream::new(stream, &data, &dummy_ctx)
            {
                for (idx, (obj_num, _)) in obj_stream.offsets.iter().enumerate() {
                    let id = ObjectIdentifier::new(*obj_num as i32, 0);
                    // Only insert if there is no entry yet or the existing entry also lives
                    // in an object stream; an entry found outside of an object stream is
                    // kept. Somewhat arbitrary and maybe we can do better, but that seems
                    // to work for the current set of tests.
                    if xref_map
                        .get(&id)
                        .is_none_or(|e| !matches!(e, &EntryType::Normal(_)))
                    {
                        xref_map.insert(
                            id,
                            EntryType::ObjStream(last_obj_num.obj_number as u32, idx as u32),
                        );
                    }
                }
            }
        } else {
            // Nothing recognisable here, advance by one byte and try again.
            r.read_byte();
        }

        if r.at_end() {
            break;
        }
    }

    // Try to choose the right trailer dict by doing basic validation.
    // Every candidate is checked, so the last one that passes is the one that is kept.
    let mut trailer_dict = None;

    for dict in trailer_dicts {
        if let Some(root_id) = dict.get_raw::<Dict<'_>>(ROOT) {
            // A plausible catalog must at least have a `Pages` entry.
            let check = |dict: &Dict<'_>| -> bool { dict.contains_key(PAGES) };

            match root_id {
                MaybeRef::Ref(r) => match xref_map.get(&r.into()) {
                    Some(EntryType::Normal(offset)) => {
                        let mut reader = Reader::new(&data.as_ref()[*offset..]);

                        if let Some(obj) =
                            reader.read_with_context::<IndirectObject<Dict<'_>>>(&dummy_ctx)
                            && check(&obj.clone().get())
                        {
                            trailer_dict = Some(dict);
                        }
                    }
                    Some(EntryType::ObjStream(obj_num, idx)) => {
                        // The catalog is inside an object stream: load the stream from its
                        // own offset and look the catalog up by index.
                        if let Some(EntryType::Normal(offset)) =
                            xref_map.get(&ObjectIdentifier::new(*obj_num as i32, 0))
                        {
                            let mut reader = Reader::new(&data.as_ref()[*offset..]);

                            if let Some(stream) =
                                reader.read_with_context::<IndirectObject<Stream<'_>>>(&dummy_ctx)
                                && let Some(data) = stream.clone().get().decoded().ok()
                                && let Some(object_stream) =
                                    ObjectStream::new(stream.get(), &data, &dummy_ctx)
                                && let Some(obj) = object_stream.get::<Dict<'_>>(*idx)
                                && check(&obj)
                            {
                                trailer_dict = Some(dict);
                            }
                        }
                    }
                    _ => {}
                },
                MaybeRef::NotRef(d) => {
                    if check(&d) {
                        trailer_dict = Some(dict);
                    }
                }
            }
        }
    }

    let has_encryption = trailer_dict
        .as_ref()
        .is_some_and(|t| t.contains_key(ENCRYPT));

    if has_encryption && recurse {
        // The problem is that in this case, we have used a dummy reader context which does not have
        // a decryptor. Therefore, we were unable to decrypt any of the object streams and missed
        // all objects that are inside of such a stream. Therefore, we need to redo the process
        // using a `ReaderContext` that does have the ability to decrypt.
        if let Some(Ok(xref)) = trailer_dict.as_ref().map(|d| {
            XRef::new(
                data.clone(),
                xref_map.clone(),
                XRefInput::TrailerDictData(d.data()),
                true,
                password,
                PdfLoadLimits::default(),
            )
        }) {
            let ctx = ReaderContext::new(&xref, false);
            // `recurse` is false here, so the second scan cannot trigger a third one. Only
            // its map is used; the trailer chosen by the first scan is kept.
            let (patched_map, _) = fallback_xref_map_inner(data, ctx, false, password);
            xref_map = patched_map;
        }
    }

    if let Some(trailer_dict_data) = trailer_dict.map(|d| d.data()) {
        (
            xref_map,
            Some(XRefInput::TrailerDictData(trailer_dict_data)),
        )
    } else if let Some(root_ref) = root_ref {
        (xref_map, Some(XRefInput::RootRef(root_ref)))
    } else {
        (xref_map, None)
    }
}
234
/// The entry-less xref table that [`XRef::dummy`] hands out.
const DUMMY_XREF: XRef = XRef(Inner::Dummy);
236
/// An xref table.
///
/// Wraps either a dummy table without entries or a reference-counted representation that
/// holds the map from object identifiers to their location in the file.
#[derive(Debug, Clone)]
pub struct XRef(Inner);
240
241impl XRef {
    /// Create an xref table from an already populated map and a description of the trailer.
    ///
    /// * `data` - the raw PDF data.
    /// * `xref_map` - the map from object identifiers to their location.
    /// * `input` - either the raw trailer dictionary data or just the root object identifier.
    /// * `repaired` - whether `xref_map` was produced by the fallback scan.
    /// * `password` - stored for a later repair and used to set up the decryptor.
    /// * `load_limits` - stored and returned by [`XRef::load_limits`].
    ///
    /// Returns [`XRefError::Unknown`] if the trailer dictionary cannot be read, if `Root`
    /// cannot be resolved, or if the root dictionary has no `Pages` reference. An error
    /// from `get_decryptor` is propagated as is.
    fn new(
        data: PdfData,
        xref_map: XrefMap,
        input: XRefInput<'_>,
        repaired: bool,
        password: &[u8],
        load_limits: PdfLoadLimits,
    ) -> Result<Self, XRefError> {
        // This is a bit hacky, but the problem is we can't read the resolved trailer dictionary
        // before we actually created the xref struct. So we first create it using dummy data
        // and then populate the data.
        let trailer_data = TrailerData::dummy();

        let mut xref = Self(Inner::Some(Arc::new(SomeRepr {
            data: Arc::new(Data::new(data)),
            map: Arc::new(RwLock::new(MapRepr { xref_map, repaired })),
            decryptor: Arc::new(Decryptor::None),
            has_ocgs: false,
            metadata: Arc::new(Metadata::default()),
            trailer_data,
            password: password.to_vec(),
            load_limits,
        })));

        // We read the trailer twice, once to determine the encryption used and then a second
        // time to resolve the catalog dictionary, etc. This allows us to support catalog dictionaries
        // that are stored in an encrypted object stream.

        // First pass: only determine the decryptor.
        let decryptor = {
            match input {
                XRefInput::TrailerDictData(trailer_dict_data) => {
                    let mut r = Reader::new(trailer_dict_data);

                    let trailer_dict = r
                        .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
                        .ok_or(XRefError::Unknown)?;

                    get_decryptor(&trailer_dict, password)?
                }
                // Without a trailer dictionary there is no encryption information available.
                XRefInput::RootRef(_) => Decryptor::None,
            }
        };

        // Install the decryptor so that the second pass reads through it.
        match &mut xref.0 {
            Inner::Dummy => unreachable!(),
            Inner::Some(r) => {
                let mutable = Arc::make_mut(r);
                mutable.decryptor = Arc::new(decryptor.clone());
            }
        }

        // Second pass: resolve the catalog, the page tree reference and the metadata.
        let (trailer_data, has_ocgs, metadata) = match input {
            XRefInput::TrailerDictData(trailer_dict_data) => {
                let mut r = Reader::new(trailer_dict_data);

                let trailer_dict = r
                    .read_with_context::<Dict<'_>>(&ReaderContext::new(&xref, false))
                    .ok_or(XRefError::Unknown)?;

                let root_ref = trailer_dict.get_ref(ROOT).ok_or(XRefError::Unknown)?;
                let root = trailer_dict
                    .get::<Dict<'_>>(ROOT)
                    .ok_or(XRefError::Unknown)?;
                // A missing `Info` dictionary simply yields default metadata.
                let metadata = trailer_dict
                    .get::<Dict<'_>>(INFO)
                    .map(|d| parse_metadata(&d))
                    .unwrap_or_default();
                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;
                let has_ocgs = root.get::<Dict<'_>>(OCPROPERTIES).is_some();
                let version = root
                    .get::<Name>(VERSION)
                    .and_then(|v| PdfVersion::from_bytes(v.deref()));

                let td = TrailerData {
                    pages_ref: pages_ref.into(),
                    root_ref: root_ref.into(),
                    version,
                };

                (td, has_ocgs, metadata)
            }
            XRefInput::RootRef(root_ref) => {
                // Only the catalog is known here, so version, optional content and metadata
                // fall back to their defaults.
                let root = xref.get::<Dict<'_>>(root_ref).ok_or(XRefError::Unknown)?;
                let pages_ref = root.get_ref(PAGES).ok_or(XRefError::Unknown)?;

                let td = TrailerData {
                    pages_ref: pages_ref.into(),
                    root_ref,
                    version: None,
                };

                (td, false, Metadata::default())
            }
        };

        // Replace the placeholder values with the resolved ones.
        match &mut xref.0 {
            Inner::Dummy => unreachable!(),
            Inner::Some(r) => {
                let mutable = Arc::make_mut(r);
                mutable.trailer_data = trailer_data;
                mutable.decryptor = Arc::new(decryptor);
                mutable.has_ocgs = has_ocgs;
                mutable.metadata = Arc::new(metadata);
            }
        }

        Ok(xref)
    }
350
351    fn is_repaired(&self) -> bool {
352        match &self.0 {
353            Inner::Dummy => false,
354            Inner::Some(r) => {
355                let locked = r.map.get();
356                locked.repaired
357            }
358        }
359    }
360
    /// Return a reference to the shared dummy xref table, which has no entries.
    pub(crate) fn dummy() -> &'static Self {
        &DUMMY_XREF
    }
364
365    pub(crate) fn load_limits(&self) -> PdfLoadLimits {
366        match &self.0 {
367            Inner::Dummy => PdfLoadLimits::default(),
368            Inner::Some(r) => r.load_limits,
369        }
370    }
371
372    pub(crate) fn len(&self) -> usize {
373        match &self.0 {
374            Inner::Dummy => 0,
375            Inner::Some(r) => r.map.get().xref_map.len(),
376        }
377    }
378
379    pub(crate) fn trailer_data(&self) -> &TrailerData {
380        match &self.0 {
381            Inner::Dummy => unreachable!(),
382            Inner::Some(r) => &r.trailer_data,
383        }
384    }
385
386    /// Number of cached parsed object-stream offset tables. QF2-B test
387    /// hook; not part of the public API.
388    #[cfg(test)]
389    pub(crate) fn object_stream_offsets_cache_len(&self) -> usize {
390        match &self.0 {
391            Inner::Dummy => 0,
392            Inner::Some(r) => r.data.object_stream_offsets_cache_len(),
393        }
394    }
395
396    pub(crate) fn metadata(&self) -> &Metadata {
397        match &self.0 {
398            Inner::Dummy => unreachable!(),
399            Inner::Some(r) => &r.metadata,
400        }
401    }
402
    /// Return the object ID of the root dictionary.
    ///
    /// # Panics
    ///
    /// Panics when called on the dummy table, because it reads the trailer data.
    pub fn root_id(&self) -> ObjectIdentifier {
        self.trailer_data().root_ref
    }
407
408    /// Whether the PDF has optional content groups.
409    pub fn has_optional_content_groups(&self) -> bool {
410        match &self.0 {
411            Inner::Dummy => false,
412            Inner::Some(r) => r.has_ocgs,
413        }
414    }
415
416    pub(crate) fn objects(&self) -> impl IntoIterator<Item = Object<'_>> + '_ {
417        match &self.0 {
418            Inner::Dummy => unimplemented!(),
419            Inner::Some(r) => {
420                let locked = r.map.get();
421                let mut elements = locked
422                    .xref_map
423                    .iter()
424                    .map(|(id, e)| {
425                        let offset = match e {
426                            EntryType::Normal(o) => (*o, 0),
427                            EntryType::ObjStream(id, index) => {
428                                if let Some(EntryType::Normal(offset)) =
429                                    locked.xref_map.get(&ObjectIdentifier::new(*id as i32, 0))
430                                {
431                                    (*offset, *index)
432                                } else {
433                                    (usize::MAX, 0)
434                                }
435                            }
436                        };
437
438                        (*id, offset)
439                    })
440                    .collect::<Vec<_>>();
441
442                // Try to yield in the order the objects appeared in the
443                // PDF.
444                elements.sort_by_key(|e1| e1.1);
445
446                let mut iter = elements.into_iter();
447
448                iter::from_fn(move || {
449                    for next in iter.by_ref() {
450                        if let Some(obj) = self.get_with(next.0, &ReaderContext::new(self, false)) {
451                            return Some(obj);
452                        } else {
453                            // Skip invalid objects.
454                            continue;
455                        }
456                    }
457
458                    None
459                })
460            }
461        }
462    }
463
464    pub(crate) fn repair(&self) {
465        let Inner::Some(r) = &self.0 else {
466            unreachable!();
467        };
468
469        let mut locked = r
470            .map
471            .try_put()
472            .expect("xref repair: map lock not contended");
473        assert!(!locked.repaired);
474
475        let (xref_map, _) = fallback_xref_map(r.data.get(), &r.password);
476        locked.xref_map = xref_map;
477        locked.repaired = true;
478    }
479
480    #[inline]
481    pub(crate) fn needs_decryption(&self, ctx: &ReaderContext<'_>) -> bool {
482        match &self.0 {
483            Inner::Dummy => false,
484            Inner::Some(r) => {
485                if matches!(r.decryptor.as_ref(), Decryptor::None) {
486                    false
487                } else {
488                    !ctx.in_content_stream() && !ctx.in_object_stream()
489                }
490            }
491        }
492    }
493
494    #[inline]
495    pub(crate) fn decrypt(
496        &self,
497        id: ObjectIdentifier,
498        data: &[u8],
499        target: DecryptionTarget,
500    ) -> Option<Vec<u8>> {
501        match &self.0 {
502            Inner::Dummy => Some(data.to_vec()),
503            Inner::Some(r) => r.decryptor.decrypt(id, data, target),
504        }
505    }
506
507    /// Return the object with the given identifier.
508    #[allow(private_bounds)]
509    pub fn get<'a, T>(&'a self, id: ObjectIdentifier) -> Option<T>
510    where
511        T: ObjectLike<'a>,
512    {
513        let ctx = ReaderContext::new(self, false);
514        self.get_with(id, &ctx)
515    }
516
517    /// Return the object with the given identifier.
518    #[allow(private_bounds)]
519    pub(crate) fn get_with<'a, T>(
520        &'a self,
521        id: ObjectIdentifier,
522        ctx: &ReaderContext<'a>,
523    ) -> Option<T>
524    where
525        T: ObjectLike<'a>,
526    {
527        let Inner::Some(repr) = &self.0 else {
528            return None;
529        };
530
531        let locked = repr.map.try_get()?;
532
533        let mut r = Reader::new(repr.data.get().as_ref());
534
535        let entry = *locked.xref_map.get(&id).or({
536            // An indirect reference to an undefined object shall not be considered an error by a PDF processor; it
537            // shall be treated as a reference to the null object.
538            None
539        })?;
540        drop(locked);
541
542        let mut ctx = ctx.clone();
543        ctx.set_obj_number(id);
544        ctx.set_in_content_stream(false);
545
546        match entry {
547            EntryType::Normal(offset) => {
548                ctx.set_in_object_stream(false);
549                r.jump(offset);
550
551                if let Some(object) = r.read_with_context::<IndirectObject<T>>(&ctx) {
552                    if object.id() == &id {
553                        return Some(object.get());
554                    }
555                } else {
556                    // There is a valid object at the offset, it's just not of the type the caller
557                    // expected, which is fine.
558                    if r.skip_not_in_content_stream::<IndirectObject<Object<'_>>>()
559                        .is_some()
560                    {
561                        return None;
562                    }
563                };
564
565                // The xref table is broken, try to repair if not already repaired.
566                if self.is_repaired() {
567                    error!(
568                        "attempt was made at repairing xref, but object {id:?} still couldn't be read"
569                    );
570
571                    None
572                } else {
573                    warn!("broken xref, attempting to repair");
574
575                    self.repair();
576
577                    // Now try reading again.
578                    self.get_with::<T>(id, &ctx)
579                }
580            }
581            EntryType::ObjStream(obj_stram_gen_num, index) => {
582                // Generation number is implicitly 0.
583                let obj_stream_id = ObjectIdentifier::new(obj_stram_gen_num as i32, 0);
584
585                if obj_stream_id == id {
586                    warn!("cycle detected in object stream");
587
588                    return None;
589                }
590
591                let stream = self.get_with::<Stream<'_>>(obj_stream_id, &ctx)?;
592                let data = repr.data.get_with(obj_stream_id, &ctx)?;
593                // QF2-B: re-use a cached `(obj_num, offset)` index table if
594                // we've already parsed this `/ObjStm` once for this
595                // document. The cache lives on the per-document `Data` and
596                // is dropped together with it.
597                let offsets = repr
598                    .data
599                    .get_object_stream_offsets_or_init(obj_stream_id, || {
600                        parse_object_stream_offsets(&stream, data)
601                    })?;
602                let object_stream = ObjectStream::from_cached_offsets(data, &ctx, offsets);
603                object_stream.get(index)
604            }
605        }
606    }
607}
608
/// An input that is passed to the xref constructor so that we can fully resolve
/// the PDF.
#[derive(Debug, Copy, Clone)]
pub(crate) enum XRefInput<'a> {
    /// This option is going to be used in 99.999% of cases. It contains the
    /// raw data of the trailer dictionary which is then going to be processed.
    TrailerDictData(&'a [u8]),
    /// In case the trailer dictionary could not be read (for example because
    /// it is cut off), we just pass the object ID of the root dictionary
    /// in case we have found one, and try our best to build the PDF just
    /// with the information we have there.
    ///
    /// Note that this won't work if the document is encrypted, as we
    /// can't access the crypto dictionary.
    RootRef(ObjectIdentifier),
}
625
626pub(crate) fn find_last_xref_pos(data: &[u8]) -> Option<usize> {
627    let mut finder = Reader::new(data);
628    let mut pos = finder.len().checked_sub(1)?;
629    finder.jump(pos);
630
631    let needle = b"startxref";
632
633    loop {
634        if finder.forward_tag(needle).is_some() {
635            finder.skip_white_spaces_and_comments();
636
637            let offset = finder.read_without_context::<i32>()?.try_into().ok()?;
638
639            return Some(offset);
640        }
641
642        pos = pos.checked_sub(1)?;
643        finder.jump(pos);
644    }
645}
646
/// A type of xref entry, i.e. where the object for an identifier can be found.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum EntryType {
    /// An indirect object that is at a specific offset in the original data.
    Normal(usize),
    /// An indirect object that is part of an object stream. First number indicates the object
    /// number of the _object stream_ (the generation number is always 0), the second number indicates
    /// the index in the object stream.
    ObjStream(u32, u32),
}
657
/// Maps an object identifier to the location of the corresponding object.
type XrefMap = FxHashMap<ObjectIdentifier, EntryType>;
659
/// Representation of a proper xref table.
#[derive(Debug)]
struct MapRepr {
    /// The entries of the table.
    xref_map: XrefMap,
    /// Whether `xref_map` was built by the fallback scan instead of being read from the
    /// xref data of the file.
    repaired: bool,
}
666
/// The values resolved from the trailer and the root dictionary when the xref is created.
#[derive(Debug, Copy, Clone)]
pub(crate) struct TrailerData {
    /// The reference stored under `Pages` in the root dictionary.
    pub(crate) pages_ref: ObjectIdentifier,
    /// The identifier of the root dictionary.
    pub(crate) root_ref: ObjectIdentifier,
    /// The version given by the root dictionary's `Version` entry, if present and valid.
    pub(crate) version: Option<PdfVersion>,
}

impl TrailerData {
    /// Placeholder with both references set to object `0 0` and no version. It is used
    /// while an [`XRef`] is being constructed, before the real values are known.
    pub(crate) fn dummy() -> Self {
        Self {
            pages_ref: ObjectIdentifier::new(0, 0),
            root_ref: ObjectIdentifier::new(0, 0),
            version: None,
        }
    }
}
683
/// The state behind a non-dummy [`XRef`].
#[derive(Debug, Clone)]
struct SomeRepr {
    /// The document data the table refers into.
    data: Arc<Data>,
    /// The xref map together with its repaired flag, behind a lock so it can be replaced
    /// during a repair.
    map: Arc<RwLock<MapRepr>>,
    /// Metadata parsed from the trailer's `Info` dictionary, or the default if absent.
    metadata: Arc<Metadata>,
    /// The decryptor derived from the trailer dictionary (`Decryptor::None` if unavailable).
    decryptor: Arc<Decryptor>,
    /// Whether the root dictionary has an optional content properties dictionary.
    has_ocgs: bool,
    /// The password the table was created with; it is passed on when the map is rebuilt.
    password: Vec<u8>,
    /// The resolved trailer data.
    trailer_data: TrailerData,
    /// The load limits the table was created with.
    load_limits: PdfLoadLimits,
}
695
/// The two possible representations of an [`XRef`].
#[derive(Debug, Clone)]
enum Inner {
    /// A dummy xref table that doesn't have any entries.
    Dummy,
    /// A proper xref table.
    Some(Arc<SomeRepr>),
}
703
/// A single parsed entry of a classic xref table.
#[derive(Debug)]
struct XRefEntry {
    /// The number stored in the first (10-digit) field of the entry.
    offset: usize,
    /// The number stored in the second (5-digit) field of the entry.
    gen_number: i32,
    /// Whether the entry is marked with `n`, i.e. refers to an object that is in use.
    used: bool,
}

impl XRefEntry {
    /// Parse one xref table entry.
    ///
    /// Bytes `0..10` are read as the decimal offset, bytes `11..16` as the decimal
    /// generation number, and byte `17` is compared against `n`.
    ///
    /// Returns `None` if `data` is too short, if either number field contains a byte that
    /// is not an ASCII digit, or if a number does not fit its target type.
    pub(crate) fn read(data: &[u8]) -> Option<Self> {
        /// Parse a field consisting only of ASCII digits.
        fn parse_decimal(digits: &[u8]) -> Option<u64> {
            digits.iter().try_fold(0_u64, |accum, &byte| match byte {
                b'0'..=b'9' => accum.checked_mul(10)?.checked_add(u64::from(byte - b'0')),
                _ => None,
            })
        }

        // Ten decimal digits can exceed `u32::MAX`, so the offset is parsed with 64 bits
        // and then narrowed to the platform's address width.
        let offset = usize::try_from(parse_decimal(data.get(0..10)?)?).ok()?;
        let gen_number = i32::try_from(parse_decimal(data.get(11..16)?)?).ok()?;

        let used = *data.get(17)? == b'n';

        Some(Self {
            offset,
            gen_number,
            used,
        })
    }
}
741
/// Maximum depth for following xref Prev/XRefStm chains to prevent stack
/// overflow on circular or deeply chained xref tables. Traversal is abandoned
/// once the depth exceeds this value.
const MAX_XREF_CHAIN_DEPTH: usize = 64;
745
/// Fill `xref_map` from the xref section at `pos` and everything chained from it, starting
/// at chain depth 0. Returns the raw data of the trailer dictionary of the section at `pos`,
/// or `None` if parsing fails.
fn populate_xref_impl<'a>(data: &'a [u8], pos: usize, xref_map: &mut XrefMap) -> Option<&'a [u8]> {
    populate_xref_depth(data, pos, xref_map, 0)
}
749
750fn populate_xref_depth<'a>(
751    data: &'a [u8],
752    pos: usize,
753    xref_map: &mut XrefMap,
754    depth: usize,
755) -> Option<&'a [u8]> {
756    if depth > MAX_XREF_CHAIN_DEPTH {
757        log::warn!("Xref chain depth exceeds {MAX_XREF_CHAIN_DEPTH}, stopping traversal");
758        return None;
759    }
760    let mut reader = Reader::new(data);
761    reader.jump(pos);
762    // In case the position points to before the object number of a xref stream.
763    reader.skip_white_spaces_and_comments();
764
765    let mut r2 = reader.clone();
766    if reader
767        .clone()
768        .read_without_context::<ObjectIdentifier>()
769        .is_some()
770    {
771        populate_from_xref_stream(data, &mut r2, xref_map, depth)
772    } else {
773        populate_from_xref_table(data, &mut r2, xref_map, depth)
774    }
775}
776
777pub(super) struct SubsectionHeader {
778    pub(super) start: u32,
779    pub(super) num_entries: u32,
780}
781
782impl Readable<'_> for SubsectionHeader {
783    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
784        r.skip_white_spaces();
785        let start = r.read_without_context::<u32>()?;
786        r.skip_white_spaces();
787        let num_entries = r.read_without_context::<u32>()?;
788        r.skip_white_spaces();
789
790        Some(Self { start, num_entries })
791    }
792}
793
794/// Populate the xref table, and return the trailer dict.
795fn populate_from_xref_table<'a>(
796    data: &'a [u8],
797    reader: &mut Reader<'a>,
798    insert_map: &mut XrefMap,
799    depth: usize,
800) -> Option<&'a [u8]> {
801    let trailer = {
802        let mut reader = reader.clone();
803        read_xref_table_trailer(&mut reader, &ReaderContext::dummy())?
804    };
805
806    reader.skip_white_spaces();
807    reader.forward_tag(b"xref")?;
808    reader.skip_white_spaces();
809
810    let mut max_obj = 0;
811
812    if let Some(prev) = trailer.get::<i32>(PREV) {
813        // First insert the entries from any previous xref tables.
814        populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
815    }
816
817    // In hybrid files, entries in `XRefStm` should have higher priority, therefore we insert them
818    // after looking at `PREV`.
819    if let Some(xref_stm) = trailer.get::<i32>(XREF_STM) {
820        populate_xref_depth(data, xref_stm as usize, insert_map, depth + 1)?;
821    }
822
823    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
824        reader.skip_white_spaces();
825
826        let start = header.start;
827        let end = start + header.num_entries;
828
829        for obj_number in start..end {
830            max_obj = max(max_obj, obj_number);
831            let bytes = reader.read_bytes(XREF_ENTRY_LEN)?;
832            let entry = XRefEntry::read(bytes)?;
833
834            // Specification says we should ignore any object number > SIZE, but probably
835            // not important?
836            if entry.used {
837                insert_map.insert(
838                    ObjectIdentifier::new(obj_number as i32, entry.gen_number),
839                    EntryType::Normal(entry.offset),
840                );
841            }
842        }
843    }
844
845    Some(trailer.data())
846}
847
848fn populate_from_xref_stream<'a>(
849    data: &'a [u8],
850    reader: &mut Reader<'a>,
851    insert_map: &mut XrefMap,
852    depth: usize,
853) -> Option<&'a [u8]> {
854    let stream = reader
855        .read_with_context::<IndirectObject<Stream<'_>>>(&ReaderContext::dummy())?
856        .get();
857
858    if let Some(prev) = stream.dict().get::<i32>(PREV) {
859        // First insert the entries from any previous xref tables.
860        let _ = populate_xref_depth(data, prev as usize, insert_map, depth + 1)?;
861    }
862
863    let size = stream.dict().get::<u32>(SIZE)?;
864
865    let [f1_len, f2_len, f3_len] = stream.dict().get::<[u8; 3]>(W)?;
866
867    if f2_len > size_of::<u64>() as u8 {
868        error!("xref offset length is larger than the allowed limit");
869
870        return None;
871    }
872
873    // Do such files exist?
874    if f1_len != 1 {
875        warn!("first field in xref stream was longer than 1");
876    }
877
878    let xref_data = stream.decoded().ok()?;
879    let mut xref_reader = Reader::new(xref_data.as_ref());
880
881    if let Some(arr) = stream.dict().get::<Array<'_>>(INDEX) {
882        let iter = arr.iter::<(u32, u32)>();
883
884        for (start, num_elements) in iter {
885            xref_stream_subsection(
886                &mut xref_reader,
887                start,
888                num_elements,
889                f1_len,
890                f2_len,
891                f3_len,
892                insert_map,
893            )?;
894        }
895    } else {
896        xref_stream_subsection(
897            &mut xref_reader,
898            0,
899            size,
900            f1_len,
901            f2_len,
902            f3_len,
903            insert_map,
904        )?;
905    }
906
907    Some(stream.dict().data())
908}
909
910fn xref_stream_num(data: &[u8]) -> Option<u32> {
911    Some(match data.len() {
912        0 => return None,
913        1 => u8::from_be(data[0]) as u32,
914        2 => u16::from_be_bytes(data[0..2].try_into().ok()?) as u32,
915        3 => u32::from_be_bytes([0, data[0], data[1], data[2]]),
916        4 => u32::from_be_bytes(data[0..4].try_into().ok()?),
917        8 => {
918            if let Ok(num) = u32::try_from(u64::from_be_bytes(data[0..8].try_into().ok()?)) {
919                return Some(num);
920            } else {
921                warn!("xref stream number is too large");
922
923                return None;
924            }
925        }
926        n => {
927            warn!("invalid xref stream number {n}");
928
929            return None;
930        }
931    })
932}
933
934fn xref_stream_subsection<'a>(
935    xref_reader: &mut Reader<'a>,
936    start: u32,
937    num_elements: u32,
938    f1_len: u8,
939    f2_len: u8,
940    f3_len: u8,
941    insert_map: &mut XrefMap,
942) -> Option<()> {
943    for i in 0..num_elements {
944        let f_type = if f1_len == 0 {
945            1
946        } else {
947            // We assume a length of 1.
948            xref_reader.read_bytes(1)?[0]
949        };
950
951        let obj_number = start + i;
952
953        match f_type {
954            // We don't care about free objects.
955            0 => {
956                xref_reader.skip_bytes(f2_len as usize + f3_len as usize)?;
957            }
958            1 => {
959                let offset = if f2_len > 0 {
960                    let data = xref_reader.read_bytes(f2_len as usize)?;
961                    xref_stream_num(data)?
962                } else {
963                    0
964                };
965
966                let gen_number = if f3_len > 0 {
967                    let data = xref_reader.read_bytes(f3_len as usize)?;
968                    xref_stream_num(data)?
969                } else {
970                    0
971                };
972
973                insert_map.insert(
974                    ObjectIdentifier::new(obj_number as i32, gen_number as i32),
975                    EntryType::Normal(offset as usize),
976                );
977            }
978            2 => {
979                let obj_stream_number = {
980                    let data = xref_reader.read_bytes(f2_len as usize)?;
981                    xref_stream_num(data)?
982                };
983                let gen_number = 0;
984                let index = if f3_len > 0 {
985                    let data = xref_reader.read_bytes(f3_len as usize)?;
986                    xref_stream_num(data)?
987                } else {
988                    0
989                };
990
991                insert_map.insert(
992                    ObjectIdentifier::new(obj_number as i32, gen_number),
993                    EntryType::ObjStream(obj_stream_number, index),
994                );
995            }
996            _ => {
997                warn!("xref has unknown field type {f_type}");
998
999                return None;
1000            }
1001        }
1002    }
1003
1004    Some(())
1005}
1006
1007fn read_xref_table_trailer<'a>(
1008    reader: &mut Reader<'a>,
1009    ctx: &ReaderContext<'a>,
1010) -> Option<Dict<'a>> {
1011    reader.skip_white_spaces();
1012    reader.forward_tag(b"xref")?;
1013    reader.skip_white_spaces();
1014
1015    while let Some(header) = reader.read_without_context::<SubsectionHeader>() {
1016        reader.jump(reader.offset() + XREF_ENTRY_LEN * header.num_entries as usize);
1017    }
1018
1019    reader.skip_white_spaces();
1020    reader.forward_tag(b"trailer")?;
1021    reader.skip_white_spaces();
1022
1023    reader.read_with_context::<Dict<'_>>(ctx)
1024}
1025
1026fn get_decryptor(trailer_dict: &Dict<'_>, password: &[u8]) -> Result<Decryptor, XRefError> {
1027    if let Some(encryption_dict) = trailer_dict.get::<Dict<'_>>(ENCRYPT) {
1028        let id = if let Some(id) = trailer_dict
1029            .get::<Array<'_>>(ID)
1030            .and_then(|a| a.flex_iter().next::<object::String>())
1031        {
1032            id.to_vec()
1033        } else {
1034            // Assume an empty ID entry.
1035            vec![]
1036        };
1037
1038        get(&encryption_dict, &id, password).map_err(XRefError::Encryption)
1039    } else {
1040        Ok(Decryptor::None)
1041    }
1042}
1043
1044/// Parse the `(obj_num, absolute_byte_offset)` index table that lives at the
1045/// start of a compressed `/ObjStm`.
1046///
1047/// Returns `None` if the stream dict is missing `/N` / `/First`, or if the
1048/// header is truncated. The returned table is the same value that an
1049/// `ObjectStream` would have populated internally before this was split out
1050/// (QF2-B). Splitting it allows the result to be cached per-document; see
1051/// [`crate::data::Data::get_object_stream_offsets_or_init`].
1052fn parse_object_stream_offsets(
1053    inner: &Stream<'_>,
1054    data: &[u8],
1055) -> Option<crate::data::ObjectStreamOffsets> {
1056    let num_objects = inner.dict().get::<usize>(N)?;
1057    let first_offset = inner.dict().get::<usize>(FIRST)?;
1058
1059    let mut r = Reader::new(data);
1060    let mut offsets = Vec::with_capacity(num_objects);
1061
1062    for _ in 0..num_objects {
1063        r.skip_white_spaces_and_comments();
1064        // Skip object number
1065        let obj_num = r.read_without_context::<u32>()?;
1066        r.skip_white_spaces_and_comments();
1067        let relative_offset = r.read_without_context::<usize>()?;
1068        offsets.push((obj_num, first_offset + relative_offset));
1069    }
1070
1071    Some(offsets)
1072}
1073
1074/// Holds a borrowed view onto the decoded bytes of an `/ObjStm` plus a
1075/// (possibly cached) parsed offset table.
1076///
1077/// QF2-B: `offsets` is now an `Arc<...>` so the same allocation can be
1078/// returned from the per-document cache on subsequent lookups, eliminating
1079/// the linear re-parse hot loop reported in
1080/// `QF1_A_FLAMEGRAPH_REPORT.md` (449× per main thread on the
1081/// `scan_for_xfa` fallback path).
1082struct ObjectStream<'a> {
1083    data: &'a [u8],
1084    ctx: ReaderContext<'a>,
1085    offsets: Arc<crate::data::ObjectStreamOffsets>,
1086}
1087
1088impl<'a> ObjectStream<'a> {
1089    /// Build a fresh `ObjectStream` by parsing the index table inline (no
1090    /// caching). Used by the xref-repair / trailer-fallback paths that
1091    /// don't have access to a `Data` cache.
1092    fn new(inner: Stream<'_>, data: &'a [u8], ctx: &ReaderContext<'a>) -> Option<Self> {
1093        let offsets = Arc::new(parse_object_stream_offsets(&inner, data)?);
1094
1095        let mut ctx = ctx.clone();
1096        ctx.set_in_object_stream(true);
1097
1098        Some(Self { data, ctx, offsets })
1099    }
1100
1101    /// Build an `ObjectStream` that reuses an already-parsed offsets table
1102    /// (typically retrieved from the per-document cache). Cheap — does no
1103    /// header scan.
1104    fn from_cached_offsets(
1105        data: &'a [u8],
1106        ctx: &ReaderContext<'a>,
1107        offsets: Arc<crate::data::ObjectStreamOffsets>,
1108    ) -> Self {
1109        let mut ctx = ctx.clone();
1110        ctx.set_in_object_stream(true);
1111
1112        Self { data, ctx, offsets }
1113    }
1114
1115    fn get<T>(&self, index: u32) -> Option<T>
1116    where
1117        T: ObjectLike<'a>,
1118    {
1119        let offset = self.offsets.get(index as usize)?.1;
1120        let mut r = Reader::new(self.data);
1121        r.jump(offset);
1122        r.skip_white_spaces_and_comments();
1123
1124        r.read_with_context::<T>(&self.ctx)
1125    }
1126}
1127
1128fn parse_metadata(info_dict: &Dict<'_>) -> Metadata {
1129    Metadata {
1130        creation_date: info_dict
1131            .get::<object::String>(CREATION_DATE)
1132            .and_then(|c| DateTime::from_bytes(&c)),
1133        modification_date: info_dict
1134            .get::<object::String>(MOD_DATE)
1135            .and_then(|c| DateTime::from_bytes(&c)),
1136        title: info_dict.get::<object::String>(TITLE).map(|t| t.to_vec()),
1137        author: info_dict.get::<object::String>(AUTHOR).map(|t| t.to_vec()),
1138        subject: info_dict.get::<object::String>(SUBJECT).map(|t| t.to_vec()),
1139        keywords: info_dict
1140            .get::<object::String>(KEYWORDS)
1141            .map(|t| t.to_vec()),
1142        creator: info_dict.get::<object::String>(CREATOR).map(|t| t.to_vec()),
1143        producer: info_dict
1144            .get::<object::String>(PRODUCER)
1145            .map(|t| t.to_vec()),
1146    }
1147}
1148
1149#[cfg(test)]
1150mod qf2b_objectstream_cache_tests {
1151    //! QF2-B — end-to-end coverage for the per-document ObjectStream
1152    //! offsets cache, using a real `/ObjStm`-containing fixture.
1153
1154    use crate::pdf::Pdf;
1155    use crate::xref::parse_object_stream_offsets;
1156
1157    /// Path to an in-tree XFA golden that contains at least one `/ObjStm`.
1158    /// Resolved relative to the pdf-syntax crate dir.
1159    const FIXTURE: &str = "../xfa-golden-tests/golden/13a7b224_xfa_issue14315.pdf";
1160
1161    fn load_fixture() -> Option<Pdf> {
1162        let bytes = std::fs::read(FIXTURE).ok()?;
1163        Pdf::new(bytes).ok()
1164    }
1165
1166    #[test]
1167    fn qf2b_objstm_cache_populates_and_is_stable_on_repeat() {
1168        let Some(pdf) = load_fixture() else {
1169            // Fixture is in-tree, but be defensive if running with a
1170            // pruned workspace.
1171            return;
1172        };
1173
1174        let xref = pdf.xref();
1175
1176        // `Pdf::new` resolves the trailer and catalog during construction;
1177        // for PDF 1.5+ files those typically live in an `/ObjStm` so the
1178        // cache is already non-empty at this point. That is itself
1179        // evidence that the cache is active.
1180        let after_construction = xref.object_stream_offsets_cache_len();
1181        assert!(
1182            after_construction >= 1,
1183            "fixture is a PDF 1.5+ doc with /ObjStm; at least one offsets table should already be cached after construction; got {after_construction}"
1184        );
1185
1186        // Resolve the catalog explicitly — must not grow the cache because
1187        // the /ObjStm carrying the catalog is already a hit.
1188        let _: Option<crate::object::Dict<'_>> = xref.get(xref.root_id());
1189        assert_eq!(
1190            xref.object_stream_offsets_cache_len(),
1191            after_construction,
1192            "repeated resolution of the same indirect object must reuse the cached offsets table"
1193        );
1194
1195        // Resolve a number of additional indirect objects. Each new
1196        // `/ObjStm` we touch may add one entry, but re-touching anything
1197        // already seen must not.
1198        for raw in 1..=20i32 {
1199            let id = crate::object::ObjectIdentifier::new(raw, 0);
1200            let _: Option<crate::object::Dict<'_>> = xref.get(id);
1201        }
1202        let after_scan = xref.object_stream_offsets_cache_len();
1203
1204        // Idempotency: a second sweep must not grow the cache further.
1205        for raw in 1..=20i32 {
1206            let id = crate::object::ObjectIdentifier::new(raw, 0);
1207            let _: Option<crate::object::Dict<'_>> = xref.get(id);
1208        }
1209        assert_eq!(
1210            xref.object_stream_offsets_cache_len(),
1211            after_scan,
1212            "repeated full scans must be cache-stable (no re-parse)"
1213        );
1214    }
1215
1216    #[test]
1217    fn qf2b_two_pdfs_have_independent_caches() {
1218        let Some(pdf_a) = load_fixture() else {
1219            return;
1220        };
1221        let Some(pdf_b) = load_fixture() else {
1222            return;
1223        };
1224
1225        // Sanity: both start with the same construction-time count for
1226        // the same fixture (same shape).
1227        let base_a = pdf_a.xref().object_stream_offsets_cache_len();
1228        let base_b = pdf_b.xref().object_stream_offsets_cache_len();
1229        assert_eq!(base_a, base_b);
1230
1231        // Touch many ids in pdf_a to (likely) populate additional /ObjStm
1232        // cache entries.
1233        for raw in 1..=50i32 {
1234            let id = crate::object::ObjectIdentifier::new(raw, 0);
1235            let _: Option<crate::object::Dict<'_>> = pdf_a.xref().get(id);
1236        }
1237        let warm_a = pdf_a.xref().object_stream_offsets_cache_len();
1238
1239        // pdf_b must NOT have grown — caches are per-document.
1240        assert_eq!(
1241            pdf_b.xref().object_stream_offsets_cache_len(),
1242            base_b,
1243            "pdf_b cache must be independent of pdf_a's warming (base_a={base_a}, warm_a={warm_a}, base_b={base_b})"
1244        );
1245    }
1246
1247    #[test]
1248    fn qf2b_parse_helper_returns_none_on_truncated_header() {
1249        // Synthetic: dict says N=3 but data has only one (num, offset)
1250        // pair. The helper must return None, and the caller must not
1251        // cache a `None`.
1252        use crate::object::Stream;
1253        use crate::reader::{Reader, ReaderContext, ReaderExt};
1254        use crate::xref::DUMMY_XREF;
1255
1256        // Build a minimal indirect stream object with /N 3 /First 6:
1257        // "1 0 obj <</N 3/First 6/Length 4>>stream\n1 0\nendstream\nendobj\n"
1258        let raw: &[u8] = b"1 0 obj <</N 3 /First 6 /Length 4>>\nstream\n1 0 \nendstream\nendobj\n";
1259        let mut r = Reader::new(raw);
1260        let ctx = ReaderContext::new(&DUMMY_XREF, false);
1261        let stream: Stream<'_> = r
1262            .read_with_context::<crate::object::indirect::IndirectObject<Stream<'_>>>(&ctx)
1263            .expect("synthetic stream should parse")
1264            .get();
1265
1266        // The stream body has only "1 0 " — three entries cannot be
1267        // recovered, so the helper must return `None`.
1268        let body: &[u8] = b"1 0 ";
1269        assert!(
1270            parse_object_stream_offsets(&stream, body).is_none(),
1271            "truncated headers must not produce a partial offsets table"
1272        );
1273    }
1274
1275    /// QF2-B perf harness. Not a correctness test — it prints microbench
1276    /// numbers and only runs when explicitly requested with
1277    /// `--ignored qf2b_bench`. The harness compares **direct re-parse**
1278    /// of /ObjStm offsets tables (what the pre-QF2-B `ObjectStream::new`
1279    /// did on every `xref.get` of an /ObjStm-stored object) versus the
1280    /// QF2-B cached lookup. This isolates the parse cost from the
1281    /// downstream object-decoding cost, which dominates `xref.get` and
1282    /// would otherwise hide the cache win.
1283    #[test]
1284    #[ignore = "perf measurement; run with `cargo test --release -- --ignored qf2b_bench`"]
1285    fn qf2b_bench_offsets_parse_vs_cached() {
1286        use std::time::Instant;
1287
1288        // 161 /ObjStm headers, 575 KB. Walking and decoding 30+ ObjStms
1289        // here is enough to measure the parse delta cleanly.
1290        let path = "../../corpus/f3800.pdf";
1291        let Ok(bytes) = std::fs::read(path) else {
1292            eprintln!("[qf2b_bench] fixture {path} unavailable; skipping");
1293            return;
1294        };
1295        let pdf = Pdf::new(bytes).expect("load f3800.pdf");
1296        let xref = pdf.xref();
1297
1298        // Warm cache via a single full sweep so we know which /ObjStms
1299        // exist.
1300        let max_id = (xref.len() as i32).min(3000);
1301        for n in 1..=max_id {
1302            let id = crate::object::ObjectIdentifier::new(n, 0);
1303            let _: Option<crate::object::Object<'_>> = xref.get(id);
1304        }
1305        let cached_objstms = xref.object_stream_offsets_cache_len();
1306        assert!(
1307            cached_objstms >= 5,
1308            "fixture must trigger several /ObjStms (got {cached_objstms})"
1309        );
1310
1311        // Collect the object-stream container ids by looking them up in
1312        // the xref entries. We then iterate the cache to compare timing
1313        // for parse-from-scratch vs cache-hit.
1314        let mut objstm_ids: Vec<crate::object::ObjectIdentifier> = Vec::new();
1315        for n in 1..=max_id {
1316            let id = crate::object::ObjectIdentifier::new(n, 0);
1317            // Only ObjStm container ids resolve as Stream + /Type ObjStm.
1318            if let Some(stream) = xref.get::<crate::object::Stream<'_>>(id)
1319                && stream
1320                    .dict()
1321                    .get::<crate::object::Name>(crate::object::dict::keys::TYPE)
1322                    .as_deref()
1323                    == Some(b"ObjStm")
1324            {
1325                objstm_ids.push(id);
1326            }
1327            if objstm_ids.len() >= cached_objstms {
1328                break;
1329            }
1330        }
1331        let containers = objstm_ids.len();
1332        assert!(containers > 0);
1333
1334        // Direct re-parse loop — mirrors pre-QF2-B behaviour: parse the
1335        // offsets table from scratch every time, no cache.
1336        const REPEATS: u32 = 200;
1337        let mut sink_parse = 0usize;
1338        let t_parse = Instant::now();
1339        for _ in 0..REPEATS {
1340            for id in &objstm_ids {
1341                let stream = xref
1342                    .get::<crate::object::Stream<'_>>(*id)
1343                    .expect("stream resolves");
1344                let Ok(decoded) = stream.decoded() else {
1345                    continue;
1346                };
1347                if let Some(offs) = parse_object_stream_offsets(&stream, &decoded) {
1348                    sink_parse = sink_parse.wrapping_add(offs.len());
1349                }
1350            }
1351        }
1352        let parse_elapsed = t_parse.elapsed();
1353
1354        // Cache-hit loop — mirrors QF2-B behaviour: retrieve the same
1355        // parsed table from the per-document cache.
1356        let inner = match &xref.0 {
1357            crate::xref::Inner::Some(r) => r.clone(),
1358            _ => unreachable!(),
1359        };
1360        let mut sink_cache = 0usize;
1361        let t_cache = Instant::now();
1362        for _ in 0..REPEATS {
1363            for id in &objstm_ids {
1364                let offs = inner
1365                    .data
1366                    .get_object_stream_offsets_or_init(*id, || {
1367                        let stream = xref
1368                            .get::<crate::object::Stream<'_>>(*id)
1369                            .expect("stream resolves");
1370                        let decoded = stream.decoded().ok()?;
1371                        parse_object_stream_offsets(&stream, &decoded)
1372                    })
1373                    .expect("cached entry must exist after warm-up");
1374                sink_cache = sink_cache.wrapping_add(offs.len());
1375            }
1376        }
1377        let cache_elapsed = t_cache.elapsed();
1378
1379        assert_eq!(
1380            sink_parse, sink_cache,
1381            "parsed and cached results must agree on offset-count totals"
1382        );
1383
1384        let speedup = parse_elapsed.as_secs_f64() / cache_elapsed.as_secs_f64().max(1e-9);
1385        let reduction = (1.0 - cache_elapsed.as_secs_f64() / parse_elapsed.as_secs_f64()) * 100.0;
1386
1387        eprintln!("[qf2b_bench] fixture: f3800.pdf");
1388        eprintln!("[qf2b_bench] /ObjStm containers measured: {containers}");
1389        eprintln!("[qf2b_bench] iterations per container:    {REPEATS}");
1390        eprintln!("[qf2b_bench] direct re-parse total:       {parse_elapsed:?}");
1391        eprintln!("[qf2b_bench] cached lookup total:         {cache_elapsed:?}");
1392        eprintln!("[qf2b_bench] speedup:                     {speedup:.1}x");
1393        eprintln!("[qf2b_bench] parse-time reduction:        {reduction:.1}%");
1394
1395        // Acceptance gate: QF2-B target is ≥ 10 % reduction on parse path.
1396        // The microbench should show much more than that, since the cache
1397        // hit is O(1) hashmap fetch + Arc clone vs O(N) memchr+nom parse.
1398        assert!(
1399            reduction >= 10.0,
1400            "QF2-B acceptance: ≥ 10 % parse-time reduction required; got {reduction:.2} %"
1401        );
1402    }
1403}