pdf_xfa/
flatten.rs

1//! # XFA Flattening Pipeline
2//!
//! This module parses the XFA template, runs layout, and writes PDF content streams.
4//!
5//! ## Pipeline Stages
6//!
7//! 1. **Extract** — `extract_embedded_fonts()` reads font programs and /Widths
8//!    arrays from PDF font dictionaries
9//! 2. **Store** — `store_font_data()` saves font bytes + widths keyed by name
10//! 3. **Resolve** — `XfaFontResolver::resolve()` matches XFA font names to
11//!    stored fonts, with fallbacks (alias, family, system)
12//! 4. **Inject** — `inject_resolved_metrics()` pushes resolved widths into
13//!    FontMetrics for the layout engine
14//! 5. **Layout** — `LayoutEngine::layout()` computes page positions using
15//!    resolved font metrics for accurate text measurement
16//! 6. **Render** — `generate_page_overlay()` in render_bridge converts LayoutDom
17//!    to PDF content stream operators
18//! 7. **Embed** — `embed_resolved_fonts()` writes font data into the PDF
19//!    and creates /Font resources
20//! 8. **Write** — The content streams are written back to PDF pages
21//!
22//! ## Static vs Dynamic Forms
23//!
24//! XFA Spec 3.3 §1.7 (p28-30):
25//! - **Static (XFAF)**: boilerplate in PDF, fields/subforms in XFA. Fixed layout.
26//! - **Dynamic (full XFA)**: all content in XFA. Layout computed at runtime.
27//! - `baseProfile="interactiveForms"` indicates static (XFAF) forms.
28//!
29//! XFA Spec 3.3 §2.9 (p72) — PDF-XFA Connection:
30//! - NeedsRendering flag: dynamic=true, XFAF=false.
31//! - XFA packets stored in AcroForm/XFA entry in catalog.
32//!
33//! ## /Widths Handling
34//!
35//! PDF /Widths arrays start at FirstChar (typically 32). For simple fonts we
36//! remap those code-indexed widths through the font encoding so the layout
37//! engine receives Unicode-indexed measurements.
38//!
39//! ## CID Font /W Arrays (PDF spec §9.7.4.3)
40//!
41//! CID fonts (Type0/composite) use `/W` arrays in the CIDFont descendant
42//! dictionary instead of simple `/Widths`. Two element types:
43//!   - `cid_start [w1 w2 ...]` — consecutive CIDs starting at cid_start
44//!   - `cid_first cid_last width` — range of CIDs with same width
45//!
46//! `/DW` (default width, defaults to 1000) covers CIDs not in `/W`.
47//!
48//! ## Known Limitations
49//!
50//! - CID-to-Unicode mapping (ToUnicode CMap) is not yet parsed
51//! - System font fallback may have different metrics than the PDF's embedded font
52
53use lopdf::{dictionary, Dictionary, Document, Object, ObjectId, Stream, StringFormat};
54use std::cell::Cell;
55use std::collections::{HashMap, HashSet};
56use std::fmt::Write as FmtWrite;
57use std::thread;
58use std::time::Duration;
59
60// GL-QA36: Re-entrance guard for flatten_xfa_to_pdf.
61//
62// When the XFA layout fails and static_fallback returns the original bytes
63// unchanged (because lopdf also cannot parse the file), a caller that retries
64// flatten on those same bytes will trigger the same failure path again,
65// causing infinite recursion and ultimately a stack overflow.
66//
67// This thread-local counter is incremented on entry to flatten_xfa_to_pdf and
68// decremented by a drop guard on exit.  If the counter is already ≥ 1 when
69// the function is entered, we return an error immediately to break the cycle.
70//
71// The counter is thread-local so the spawned worker thread (thread::spawn
72// inside flatten_xfa_to_pdf) starts with its own fresh counter = 0 and is
73// not affected by the caller's guard.
thread_local! {
    /// Number of `flatten_xfa_to_pdf_internal` calls currently active on this
    /// thread. The function refuses to run when this is already ≥ 1 on entry.
    static FLATTEN_DEPTH: Cell<u32> = const { Cell::new(0) };
}
77
78#[cfg(feature = "xfa-js-sandboxed")]
79use crate::dynamic::apply_dynamic_scripts_with_runtime;
80use crate::dynamic::{
81    apply_dynamic_scripts, apply_dynamic_scripts_with_mode, DynamicScriptOutcome, JsExecutionMode,
82    OutputQuality,
83};
84use crate::error::{Result, XfaError};
85use crate::extract::extract_xfa_from_bytes;
86use crate::font_bridge::{
87    font_variant_key, pdf_glyph_name_to_unicode, CidFontInfo, EmbeddedFontData, PdfBaseEncoding,
88    PdfSimpleEncoding, PdfSourceFont, ResolvedFont, XfaFontResolver, XfaFontSpec,
89};
90use crate::image_bridge::embed_image;
91use crate::javascript_policy::{self, JavaScriptEntryPoint};
92use crate::merger::FormMerger;
93use crate::render_bridge::{
94    generate_all_overlays, generate_field_values_overlays, unicode_to_winansi, FontMetricsData,
95    PageOverlay, XfaRenderConfig,
96};
97use xfa_dom_resolver::data_dom::DataDom;
98use xfa_layout_engine::form::{DrawContent, FormNodeId, FormNodeStyle, FormTree};
99use xfa_layout_engine::layout::{
100    LayoutContent, LayoutDom, LayoutEngine, LayoutNode, LayoutProfile,
101};
102use xfa_layout_engine::trace::{sites as trace_sites, Reason as TraceReason};
103
104use crate::adobe_compat::{
105    cap_suppression_by_form_dom, emit_bind_none_summary, emit_non_data_widget_summary,
106    exclude_bind_none_fields_from_page_data_suppression,
107    exclude_non_data_widgets_from_page_suppression,
108    static_xfaf_excess_page_trim_with_form_dom_guard,
109    suppress_empty_pages_only_when_real_data_bound, BindNoneClassification, WidgetClassification,
110};
111
112// ---------------------------------------------------------------------------
113// XFA-F6-01 (#1109): Pipeline stage ordering contract.
114//
115// The XFA flatten pipeline must execute stages in strict order:
116//   Extract → Bind → Layout → Render → Embed → Write → Cleanup
117//
118// `debug_assert!` calls at stage boundaries verify this order at runtime in
119// debug builds. The PipelineStage enum is Ord so comparisons are cheap.
120// ---------------------------------------------------------------------------
121
/// Stages of the XFA flatten process, declared in execution order.
///
/// The derived `Ord` follows declaration order (discriminants 0 through 6),
/// so a later stage always compares greater than an earlier one. Stage
/// boundaries rely on this in their `debug_assert!` ordering checks.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
enum PipelineStage {
    Extract,
    Bind,
    Layout,
    Render,
    Embed,
    Write,
    Cleanup,
}
136
137fn create_minimal_pdf_document() -> Document {
138    let mut doc = Document::new();
139    let pages_id = doc.add_object(Object::Dictionary(dictionary! {
140        "Type" => Object::Name(b"Pages".to_vec()),
141        "Kids" => Object::Array(vec![]),
142        "Count" => Object::Integer(0)
143    }));
144    let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
145        "Type" => Object::Name(b"Catalog".to_vec()),
146        "Pages" => Object::Reference(pages_id)
147    }));
148    doc.trailer.set("Root", Object::Reference(catalog_id));
149    doc
150}
151
/// Layout metadata emitted only for CLI diagnostics.
#[derive(Debug, Clone, Default)]
pub struct LayoutDump {
    /// Per-page layout entries (see [`LayoutDumpEntry`]).
    pub pages: Vec<LayoutDumpEntry>,
    /// Dynamic-script outcome; `FlattenOutput::new` overwrites this with the
    /// outcome it is handed.
    pub dynamic_scripts: DynamicScriptOutcome,
    /// Output quality; `FlattenOutput::new` sets this to the script outcome's
    /// `output_quality`.
    pub output_quality: OutputQuality,
}
162
/// One page entry in the optional layout dump.
#[derive(Debug, Clone)]
pub struct LayoutDumpEntry {
    /// Page number. NOTE(review): whether this is 0- or 1-based is not
    /// visible here — confirm against the code that fills the dump.
    pub page_num: u32,
    /// Height of the page. NOTE(review): unit is presumably PDF points — confirm.
    pub page_height: f64,
    /// Height used by laid-out content on this page; presumably in the same
    /// unit as `page_height` — confirm.
    pub used_height: f64,
    /// Presumably `true` when content from this page continued onto the next
    /// page — confirm against the layout profile producer.
    pub overflow_to_next: bool,
    /// Presumably identifies the first element that overflowed, when there is
    /// one — confirm what string is stored here.
    pub first_overflow_element: Option<String>,
}
/// Script-processing metadata returned next to the flattened PDF bytes by the
/// `*_with_metadata` entry points.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct FlattenMetadata {
    /// Outcome of the dynamic-script pass.
    pub dynamic_scripts: DynamicScriptOutcome,
    /// Output quality; `from_dynamic_scripts` copies it from
    /// `dynamic_scripts.output_quality`.
    pub output_quality: OutputQuality,
}
186
187impl FlattenMetadata {
188    fn from_dynamic_scripts(dynamic_scripts: DynamicScriptOutcome) -> Self {
189        Self {
190            dynamic_scripts,
191            output_quality: dynamic_scripts.output_quality,
192        }
193    }
194}
195
/// Internal result of one flatten run; the public entry points pick the
/// fields they expose.
struct FlattenOutput {
    /// Bytes of the resulting PDF.
    pdf_bytes: Vec<u8>,
    /// Layout diagnostics; a default (empty) dump when built via `without_dump`.
    layout_dump: LayoutDump,
    /// Script metadata derived from the dynamic-script outcome.
    metadata: FlattenMetadata,
}
201
202impl FlattenOutput {
203    fn new(
204        pdf_bytes: Vec<u8>,
205        mut layout_dump: LayoutDump,
206        dynamic_scripts: DynamicScriptOutcome,
207    ) -> Self {
208        layout_dump.dynamic_scripts = dynamic_scripts;
209        layout_dump.output_quality = dynamic_scripts.output_quality;
210        Self {
211            pdf_bytes,
212            layout_dump,
213            metadata: FlattenMetadata::from_dynamic_scripts(dynamic_scripts),
214        }
215    }
216
217    fn without_dump(pdf_bytes: Vec<u8>) -> Self {
218        Self::new(
219            pdf_bytes,
220            LayoutDump::default(),
221            DynamicScriptOutcome::default(),
222        )
223    }
224}
225
226/// Returns `true` if the PDF bytes contain an `/Encrypt` entry in the trailer.
227pub fn is_pdf_encrypted(pdf_bytes: &[u8]) -> bool {
228    Document::load_mem(pdf_bytes)
229        .map(|doc| doc.trailer.get(b"Encrypt").is_ok())
230        .unwrap_or(false)
231}
232
/// Outcome of `try_decrypt_pdf`.
enum DecryptResult {
    /// No encryption was detected, or the bytes could not be parsed and the
    /// decision is left to downstream code.
    NotEncrypted,
    /// The document was decrypted; holds the re-serialised PDF bytes.
    Decrypted(Vec<u8>),
    /// Decryption with the empty password failed, or the decrypted document
    /// could not be saved.
    NeedsPassword,
}
238
239/// Try to handle encryption: if not encrypted return as-is, if encrypted try
240/// empty password (owner-only encryption), otherwise report needs-password.
241fn try_decrypt_pdf(pdf_bytes: &[u8]) -> DecryptResult {
242    let mut doc = match Document::load_mem(pdf_bytes) {
243        Ok(d) => d,
244        Err(_) => return DecryptResult::NotEncrypted, // Can't parse — let downstream handle it
245    };
246
247    // lopdf auto-decrypts with empty password on load and removes /Encrypt.
248    // Use was_encrypted() to detect this — the original bytes are still encrypted
249    // and downstream parsers (pdf_syntax) can't read them.
250    if doc.was_encrypted() {
251        // Already decrypted by lopdf — save the decrypted document.
252        let mut buf = Vec::new();
253        match doc.save_to(&mut buf) {
254            Ok(()) => return DecryptResult::Decrypted(buf),
255            Err(_) => return DecryptResult::NeedsPassword,
256        }
257    }
258
259    if doc.trailer.get(b"Encrypt").is_ok() {
260        // /Encrypt present but lopdf couldn't auto-decrypt — try explicit empty password.
261        match Document::load_mem_with_password(pdf_bytes, "") {
262            Ok(mut decrypted_doc) => {
263                decrypted_doc.trailer.remove(b"Encrypt");
264                let mut buf = Vec::new();
265                match decrypted_doc.save_to(&mut buf) {
266                    Ok(()) => return DecryptResult::Decrypted(buf),
267                    Err(_) => return DecryptResult::NeedsPassword,
268                }
269            }
270            Err(_) => return DecryptResult::NeedsPassword,
271        }
272    }
273
274    DecryptResult::NotEncrypted
275}
276
277/// Returns `true` if the layout nodes contain at least one field node.
278/// Checks the FormTree source node because the layout engine may emit
279/// `WrappedText` instead of `Field` for fields with content.
280///
281/// `bind_none_count` accumulates the number of `<bind match="none">`
282/// fields encountered during the walk so the caller can emit a
283/// per-flatten summary via [`emit_bind_none_summary`]. The counter is
284/// `Cell<usize>` so the helper stays a `&` reference and the recursion
285/// remains side-effect-free apart from the increment.
286fn page_has_fields(
287    nodes: &[LayoutNode],
288    tree: &FormTree,
289    bind_none_count: &Cell<usize>,
290    widget_count: &Cell<usize>,
291) -> bool {
292    use xfa_layout_engine::form::{FieldKind, FormNodeType};
293    nodes.iter().any(|n| {
294        // Signature, button, and barcode fields carry no data value and must
295        // not count as "data fields" for page-suppression purposes.  A page
296        // whose only fields are non-data widgets (e.g. a signature-only page)
297        // must be treated as static-only and always retained.
298        // Fields with bind match="none" use template defaults and are never
299        // populated from datasets, so they are also static for this purpose.
300        let meta = tree.meta(n.form_node);
301        let is_field = matches!(tree.get(n.form_node).node_type, FormNodeType::Field { .. });
302        // M5.2b: the non-data-widget check is now expressed via
303        // `exclude_non_data_widgets_from_page_suppression`. The boolean
304        // it consumes is the same one the inline `matches!(…)` produced.
305        let field_kind_is_non_data_widget = matches!(
306            meta.field_kind,
307            FieldKind::Signature | FieldKind::Button | FieldKind::Barcode
308        );
309        let widget_class =
310            exclude_non_data_widgets_from_page_suppression(field_kind_is_non_data_widget);
311        let is_non_data_widget =
312            matches!(widget_class, WidgetClassification::ExcludedNonDataWidget);
313        if is_field && is_non_data_widget {
314            widget_count.set(widget_count.get() + 1);
315        }
316        let classification = exclude_bind_none_fields_from_page_data_suppression(
317            is_field,
318            is_non_data_widget,
319            meta.data_bind_none,
320        );
321        if matches!(classification, BindNoneClassification::ExcludedBindNone) {
322            bind_none_count.set(bind_none_count.get() + 1);
323        }
324        let is_data_field = is_field && matches!(classification, BindNoneClassification::DataField);
325        is_data_field || page_has_fields(&n.children, tree, bind_none_count, widget_count)
326    })
327}
328
329/// Returns `true` if the layout nodes contain at least one field with a
330/// non-empty value.  Checks the FormTree source node because the layout
331/// engine converts non-empty field values to `WrappedText` for line-
332/// wrapping, making `LayoutContent::Field` unreliable for data detection.
333fn page_has_field_data(nodes: &[LayoutNode], tree: &FormTree) -> bool {
334    use xfa_layout_engine::form::FormNodeType;
335    nodes.iter().any(|n| {
336        matches!(
337            &tree.get(n.form_node).node_type,
338            FormNodeType::Field { value } if !value.is_empty()
339        ) || page_has_field_data(&n.children, tree)
340    })
341}
342
343/// Flatten all XFA content in `pdf_bytes` to static PDF content streams.
344///
345/// Returns the modified PDF bytes. The /AcroForm entry is removed so the
346/// result is a plain PDF/1.4 document.
347///
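/// If the raw bytes contain neither `/AcroForm` nor an `xdp:xdp` marker, returns
/// a clone of the input unchanged. Inputs that carry those markers but yield no
/// XFA packet go through the static fallback instead.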
349///
350/// # Performance Target
351///
352/// P95 latency ≤ 5 seconds for 50-page documents (see
353/// `docs/XFA_SUCCESS_CRITERIA.md`).  The pipeline uses a 30-second hard
354/// timeout per document; pathological inputs fall back to `static_fallback`.
355///
356/// # Debug Logging
357///
358/// Enable debug logging with `RUST_LOG=pdf_xfa=debug`.
359///
360/// # Oracle Comparison Approach (XFA-F1-04)
361///
362/// Reference ("oracle") output for quality comparison is generated using:
363///
364/// 1. **pdfRest** — `POST https://api.pdfrest.com/flatten-pdf`
365///    Uses Adobe's XFA engine.  Highest fidelity.  Rate-limited to ~1200
366///    calls/month across two accounts.  Keys at
367///    `~/.config/pdfluent/pdfrest-keys.json`.
368///
369///    ```bash
370///    # curl -X POST "https://api.pdfrest.com/flatten-pdf" \
371///    #   -H "Api-Key: <KEY>" \
372///    #   --form "input=@form.xfa.pdf;type=application/pdf" \
373///    #   -o reference.pdfrest.pdf
374///    ```
375///
376/// 2. **mutool** — `mutool convert -o reference.pdf input.xfa.pdf`
377///    Secondary oracle.  Free, offline, limited XFA support.
378///
379///    ```bash
380///    # mutool convert -o reference.mutool.pdf input.xfa.pdf
381///    ```
382///
383/// Quality is measured as per-page SSIM vs. the pdfRest oracle (target ≥ 0.95).
384/// See `scripts/generate_xfa_reference.sh` and `docs/XFA_SUCCESS_CRITERIA.md`.
385#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
386pub fn flatten_xfa_to_pdf(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
387    flatten_xfa_to_pdf_internal(pdf_bytes, false).map(|out| out.pdf_bytes)
388}
389/// flatten_xfa_to_pdf_with_layout_dump.
390#[must_use = "flattened PDF bytes and layout dump must be used; discarding them loses output"]
391pub fn flatten_xfa_to_pdf_with_layout_dump(pdf_bytes: &[u8]) -> Result<(Vec<u8>, LayoutDump)> {
392    let out = flatten_xfa_to_pdf_internal(pdf_bytes, true)?;
393    Ok((out.pdf_bytes, out.layout_dump))
394}
395/// flatten_xfa_to_pdf_with_metadata.
396#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
397pub fn flatten_xfa_to_pdf_with_metadata(pdf_bytes: &[u8]) -> Result<(Vec<u8>, FlattenMetadata)> {
398    let out = flatten_xfa_to_pdf_internal(pdf_bytes, false)?;
399    Ok((out.pdf_bytes, out.metadata))
400}
401/// flatten_xfa_to_pdf_with_layout_dump_and_metadata.
402#[must_use = "flattened PDF bytes, layout dump, and metadata must be used; discarding them loses output"]
403pub fn flatten_xfa_to_pdf_with_layout_dump_and_metadata(
404    pdf_bytes: &[u8],
405) -> Result<(Vec<u8>, LayoutDump, FlattenMetadata)> {
406    let out = flatten_xfa_to_pdf_internal(pdf_bytes, true)?;
407    Ok((out.pdf_bytes, out.layout_dump, out.metadata))
408}
409
410fn flatten_xfa_to_pdf_internal(
411    pdf_bytes: &[u8],
412    collect_layout_dump: bool,
413) -> Result<FlattenOutput> {
414    // GL-QA36: Re-entrance guard.  If this function is entered while already
415    // running on this thread (depth ≥ 1), a recursive call has occurred —
416    // most likely a fallback path returning the original bytes which still
417    // contain /AcroForm + xdp:xdp markers.  Abort immediately with an error
418    // to prevent the infinite recursion / stack overflow.
419    //
420    // The worker thread spawned below has its own thread-local so its depth
421    // starts at 0 and is unaffected by this guard.
422    let depth = FLATTEN_DEPTH.with(|d| d.get());
423    if depth >= 1 {
424        return Err(XfaError::LayoutFailed(
425            "flatten_xfa_to_pdf called recursively — aborting to prevent stack overflow".into(),
426        ));
427    }
428    FLATTEN_DEPTH.with(|d| d.set(depth + 1));
429    // Drop guard: decrement the counter even if we return early.
430    struct DepthGuard;
431    impl Drop for DepthGuard {
432        fn drop(&mut self) {
433            FLATTEN_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
434        }
435    }
436    let _depth_guard = DepthGuard;
437
438    // 0a. Quick byte-level pre-check: if the raw bytes don't contain /AcroForm
439    //     (where XFA lives per the spec) and no XDP namespace, skip expensive
440    //     parsing. This prevents multi-second stalls on large non-XFA PDFs.
441    if !pdf_bytes.windows(9).any(|w| w == b"/AcroForm")
442        && !pdf_bytes.windows(7).any(|w| w == b"xdp:xdp")
443    {
444        return Ok(FlattenOutput::without_dump(pdf_bytes.to_vec()));
445    }
446
447    // 0b. Handle encrypted PDFs: try empty-password decrypt (owner-only encryption),
448    //     otherwise reject early — encrypted content produces garbage output.
449    let decrypted;
450    let pdf_bytes = match try_decrypt_pdf(pdf_bytes) {
451        DecryptResult::NotEncrypted => pdf_bytes,
452        DecryptResult::Decrypted(bytes) => {
453            decrypted = bytes;
454            &decrypted
455        }
456        DecryptResult::NeedsPassword => {
457            return Err(XfaError::Encrypted(
458                "PDF is encrypted and requires a password".into(),
459            ));
460        }
461    };
462
463    // 1. Extract XFA packets.
464    let packets = match extract_xfa_from_bytes(pdf_bytes.to_vec()) {
465        Ok(p) => p,
466        Err(_) => {
467            // No XFA packet was extracted, but the byte-level pre-check already
468            // established that the document carries /AcroForm or XFA markers.
469            // Fall back to static cleanup so AcroForm-only inputs still flatten.
470            return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
471        }
472    };
473
474    let template_xml = match packets.template() {
475        Some(t) => strip_undefined_xml_entities(t),
476        None => {
477            // XFA present but template packet missing/unparseable (truncated XML).
478            // Strip AcroForm + NeedsRendering so renderers use static content.
479            trace_sites::fallback(
480                TraceReason::StaticFallbackTaken,
481                "template packet missing or unparseable",
482            );
483            return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
484        }
485    };
486
487    // 1b. Detect corrupt/minimal XFA: tiny PDFs (<1KB) whose template has no
488    //     real content (no <subform> or <pageSet> children) produce blank output.
489    //     Fall back to static page copy so the original pages are preserved.
490    if is_corrupt_xfa_template(pdf_bytes.len(), &template_xml) {
491        trace_sites::fallback(
492            TraceReason::StaticFallbackTaken,
493            "corrupt or minimal XFA template",
494        );
495        return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
496    }
497
498    // 2. Try XFA template → layout → render pipeline.
499    //    If this fails (parse error, empty template, layout 0 pages, lopdf error),
500    //    fall back to preserving the existing page content with AcroForm stripped.
501    //
502    //    Wrap in a thread-based timeout (30s) to prevent hangs on pathological
503    //    XFA documents. If the timeout fires, the join handle's result is an Err
504    //    and we fall back to static_fallback.
505    const FLATTEN_TIMEOUT: Duration = Duration::from_secs(30);
506    let pdf_bytes_ref = pdf_bytes.to_vec();
507    let template_xml_owned = template_xml.clone();
508    let datasets_xml_owned = packets.datasets().map(strip_undefined_xml_entities);
509    let form_xml_owned = packets.get_packet("form").map(|s| s.to_string());
510
511    let handle = thread::spawn(move || {
512        xfa_flatten_inner(
513            &pdf_bytes_ref,
514            &template_xml_owned,
515            datasets_xml_owned.as_deref(),
516            form_xml_owned.as_deref(),
517            collect_layout_dump,
518        )
519    });
520
521    match handle.join() {
522        Ok(Ok(out)) => Ok(out),
523        Ok(Err(e @ XfaError::UnsupportedFeature(_))) => Err(e),
524        Ok(Err(e)) => {
525            eprintln!("XFA flatten failed: {e:?}");
526            trace_sites::fallback(
527                TraceReason::StaticFallbackTaken,
528                format!("inner pipeline error: {e:?}"),
529            );
530            static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
531        }
532        Err(_) => {
533            eprintln!("XFA flatten timed out after {:?}", FLATTEN_TIMEOUT);
534            trace_sites::fallback(TraceReason::StaticFallbackTaken, "inner pipeline timeout");
535            static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
536        }
537    }
538}
539
540/// Core XFA flatten pipeline: parse template, bind data, layout, render.
541fn xfa_flatten_inner(
542    pdf_bytes: &[u8],
543    template_xml: &str,
544    datasets_xml: Option<&str>,
545    form_xml: Option<&str>,
546    collect_layout_dump: bool,
547) -> Result<FlattenOutput> {
548    // XFA-F6-01 (#1109): pipeline stage tracker — verifies strict ordering via
549    // debug_assert in each stage transition below.
550    let mut _stage = PipelineStage::Extract;
551
552    // PIPELINE: stage 0 — Extract (parse datasets and image files from PDF)
553    log::debug!(
554        "XFA flatten: {} bytes input, template={} bytes",
555        pdf_bytes.len(),
556        template_xml.len()
557    );
558
559    let data_dom = if let Some(ds_xml) = datasets_xml {
560        DataDom::from_xml(ds_xml)
561            .map_err(|e| XfaError::ParseFailed(format!("datasets parse: {e}")))?
562    } else {
563        DataDom::new()
564    };
565
566    // Extract embedded image files from the PDF for resolving <image href="…">
567    // references in the XFA template (XFA §2.3).
568    let image_files = match Document::load_mem(pdf_bytes) {
569        Ok(doc) => extract_embedded_images(&doc),
570        Err(_) => HashMap::new(),
571    };
572
573    // XFA-F9-02 (#1121): Graceful degradation — warn on unsupported features
574    // instead of failing silently.  These checks run once per document after
575    // template extraction so they add negligible overhead.
576    if template_xml.contains("barcode") {
577        log::warn!("XFA barcode elements found but not supported — rendered as empty boxes");
578    }
579    if template_xml.contains("<signature") || template_xml.contains("<Signature") {
580        log::warn!("XFA signature elements found but not supported — elements skipped");
581    }
582    if javascript_policy::template_mentions_javascript(template_xml) {
583        log::warn!(
584            "{}",
585            javascript_policy::execution_denied_message(JavaScriptEntryPoint::XfaEventHook)
586        );
587    }
588
589    // PIPELINE: stage 1 — Bind (merge template with data DOM)
590    debug_assert!(
591        _stage <= PipelineStage::Bind,
592        "pipeline stage order violated: expected <= Bind"
593    );
594    _stage = PipelineStage::Bind;
595
596    let merger = FormMerger::new(&data_dom).with_image_files(image_files);
597    let (mut tree, root_id) = merger
598        .merge(template_xml)
599        .map_err(|e| XfaError::ParseFailed(format!("template merge: {e}")))?;
600
601    log::debug!("XFA bind: {} form nodes created", tree.nodes.len());
602
603    // UX1: emit one (bind, …) trace event summarising the result of the
604    // template+data merge. The event records whether any visible data
605    // binding occurred — the same boolean (`tree.any_data_bound`) that
606    // gates the page-suppression heuristic later on. Sink is off by
607    // default; the call costs one thread-local read when no sink is
608    // installed.
609    let bind_reason = if tree.any_data_bound {
610        TraceReason::SubformMaterialisedFromData
611    } else {
612        TraceReason::SubformSuppressedNoData
613    };
614    trace_sites::bind(
615        "root",
616        bind_reason,
617        format!(
618            "form_nodes={} any_data_bound={}",
619            tree.nodes.len(),
620            tree.any_data_bound
621        ),
622    );
623
624    // M3-B Phase C validation hook (2026-05-03):
625    // Allow operators (CLI, integration tests, cohort runs) to engage the
626    // sandboxed JavaScript runtime by setting `XFA_JS_EXECUTION_MODE`.
627    // - unset / "default" / "best_effort_static" → existing default
628    //   (`BestEffortStatic`), no behaviour change for any existing user.
629    // - "strict" → `Strict` (M8 `DENY_EXECUTION`).
630    // - "sandboxed" / "sandboxed_runtime" → `SandboxedRuntime` (Phase B+C).
631    //   Only effective when the `xfa-js-sandboxed` Cargo feature is compiled
632    //   in; otherwise NullRuntime returns NotCompiledIn and the dispatch
633    //   path falls back to the same skip behaviour as `BestEffortStatic`.
634    let dynamic_scripts = match std::env::var("XFA_JS_EXECUTION_MODE")
635        .ok()
636        .map(|s| s.to_ascii_lowercase())
637        .as_deref()
638    {
639        Some("strict") => {
640            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::Strict)?
641        }
642        Some("sandboxed") | Some("sandboxed_runtime") => {
643            // Phase D-γ: create the runtime manually so we can call
644            // `set_data_handle` before script execution, making the DataDom
645            // accessible from `$record` and `xfa.resolveNodes("data.*")`.
646            #[cfg(feature = "xfa-js-sandboxed")]
647            {
648                use crate::js_runtime::{NullRuntime, QuickJsRuntime, XfaJsRuntime};
649                match QuickJsRuntime::new() {
650                    Ok(mut rt) => {
651                        rt.set_data_handle(&data_dom as *const _);
652                        apply_dynamic_scripts_with_runtime(
653                            &mut tree,
654                            root_id,
655                            JsExecutionMode::SandboxedRuntime,
656                            &mut rt,
657                        )?
658                    }
659                    Err(_) => apply_dynamic_scripts_with_runtime(
660                        &mut tree,
661                        root_id,
662                        JsExecutionMode::SandboxedRuntime,
663                        &mut NullRuntime::new(),
664                    )?,
665                }
666            }
667            #[cfg(not(feature = "xfa-js-sandboxed"))]
668            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::SandboxedRuntime)?
669        }
670        _ => apply_dynamic_scripts(&mut tree, root_id)?,
671    };
672    if dynamic_scripts.output_quality != OutputQuality::Exact {
673        // M3-B Phase C (2026-05-03): appended host-binding counters after
674        // the Phase B JS runtime counters. Defaults stay 0 in
675        // `BestEffortStatic` mode so existing log parsers remain compatible.
676        log::warn!(
677            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={}",
678            dynamic_scripts.output_quality.as_str(),
679            dynamic_scripts.js_present,
680            dynamic_scripts.js_skipped,
681            dynamic_scripts.other_skipped,
682            dynamic_scripts.formcalc_run,
683            dynamic_scripts.formcalc_errors,
684            dynamic_scripts.js_executed,
685            dynamic_scripts.js_runtime_errors,
686            dynamic_scripts.js_timeouts,
687            dynamic_scripts.js_oom,
688            dynamic_scripts.js_host_calls,
689            dynamic_scripts.js_mutations,
690            dynamic_scripts.js_instance_writes,
691            dynamic_scripts.js_list_writes,
692            dynamic_scripts.js_binding_errors,
693            dynamic_scripts.js_resolve_failures,
694            dynamic_scripts.js_data_reads,
695        );
696        eprintln!(
697            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={}",
698            dynamic_scripts.output_quality.as_str(),
699            dynamic_scripts.js_present,
700            dynamic_scripts.js_skipped,
701            dynamic_scripts.other_skipped,
702            dynamic_scripts.formcalc_run,
703            dynamic_scripts.formcalc_errors,
704            dynamic_scripts.js_executed,
705            dynamic_scripts.js_runtime_errors,
706            dynamic_scripts.js_timeouts,
707            dynamic_scripts.js_oom,
708            dynamic_scripts.js_host_calls,
709            dynamic_scripts.js_mutations,
710            dynamic_scripts.js_instance_writes,
711            dynamic_scripts.js_list_writes,
712            dynamic_scripts.js_binding_errors,
713            dynamic_scripts.js_resolve_failures,
714            dynamic_scripts.js_data_reads,
715        );
716    }
717
718    // XFA §3: when the PDF contains a pre-merged form DOM (saved by Adobe's
719    // runtime after scripts executed), use its presence attributes to override
720    // the template-based defaults. This captures script-driven visibility
721    // changes (e.g. Avoka framework's sfcUtils.updateVisibility) that our
722    // FormCalc interpreter cannot execute.
723    if let Some(fxml) = form_xml {
724        apply_form_dom_presence(&mut tree, root_id, fxml);
725    }
726
727    // Resolve fonts BEFORE layout so the layout engine uses actual font metrics
728    // (widths, ascender, descender) instead of generic AFM tables.
729    let resolved_fonts = resolve_template_fonts(template_xml, pdf_bytes);
730    inject_resolved_metrics(&mut tree, &resolved_fonts);
731
732    // PIPELINE: stage 2 — Layout (compute page positions using resolved font metrics)
733    debug_assert!(
734        _stage <= PipelineStage::Layout,
735        "pipeline stage order violated: expected <= Layout"
736    );
737    _stage = PipelineStage::Layout;
738
739    let (mut layout, mut layout_dump) = {
740        let engine = LayoutEngine::new(&tree);
741        if collect_layout_dump {
742            let (layout, profile) = engine
743                .layout_with_profile(root_id)
744                .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
745            (layout, Some(layout_dump_from_profile(profile)))
746        } else {
747            let layout = engine
748                .layout(root_id)
749                .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
750            (layout, None)
751        }
752    };
753
754    if layout.pages.is_empty() {
755        return Err(XfaError::LayoutFailed("layout produced 0 pages".into()));
756    }
757
758    log::debug!("XFA layout: {} pages produced", layout.pages.len());
759
760    // UX1: emit one (paginate, …) trace event summarising the layout
761    // engine output. The available_h / needed_h channels carry the page
762    // count and form-DOM target so trace consumers can reconstruct the
763    // pagination decision without re-parsing the layout dump.
764    let form_dom_pages = form_xml.and_then(form_dom_page_count).unwrap_or(0);
765    trace_sites::paginate(
766        "root",
767        TraceReason::PaginateFitsCurrentPage,
768        layout.pages.len() as f64,
769        form_dom_pages as f64,
770    );
771
772    // Two-pass recovery: when the form DOM presence application causes
773    // under-pagination relative to the declared form_dom_page_count, re-run
774    // layout without form DOM presence overrides.
775    //
776    // Context: `apply_form_dom_presence` suppresses named template subforms
777    // that are absent from the form DOM (script-driven conditional sections).
778    // When data binding is absent (any_data_bound=false) or when the form DOM
779    // only records one rendering branch of a multi-branch template (e.g. a
780    // Subform_Core / Subform_pdfHTML pair where only Core appears in the form
781    // DOM), the suppression can remove structural content that was present in
782    // the original rendering, reducing page count below what the form DOM
783    // itself declares.
784    //
785    // Guard: trigger when the declared form DOM page count is higher than what
786    // the first pass produced.  Adopt the template-only result ONLY if it
787    // produces exactly `fdp_count` pages — this prevents over-adoption when
788    // form DOM has boilerplate pageArea declarations that were never all
789    // rendered (e.g. large application forms with 18 declared pageAreas but
790    // only 3–5 actually used).
791    if let Some(fdp_count) = form_xml.and_then(form_dom_page_count) {
792        if layout.pages.len() < fdp_count {
793            log::debug!(
794                "XFA layout: form_dom declared {} pages but layout produced {} — \
795                 re-running without form-dom presence overrides",
796                fdp_count,
797                layout.pages.len(),
798            );
799            let image_files2 = match lopdf::Document::load_mem(pdf_bytes) {
800                Ok(doc) => extract_embedded_images(&doc),
801                Err(_) => HashMap::new(),
802            };
803            let merge_result2 = FormMerger::new(&data_dom)
804                .with_image_files(image_files2)
805                .merge(template_xml)
806                .map_err(|e| XfaError::ParseFailed(format!("template re-merge: {e}")));
807            if let Ok((mut tree2, root_id2)) = merge_result2 {
808                inject_resolved_metrics(&mut tree2, &resolved_fonts);
809                let layout2_result = {
810                    let engine2 = LayoutEngine::new(&tree2);
811                    if collect_layout_dump {
812                        engine2
813                            .layout_with_profile(root_id2)
814                            .map(|(l, p)| (l, Some(layout_dump_from_profile(p))))
815                            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))
816                    } else {
817                        engine2
818                            .layout(root_id2)
819                            .map(|l| (l, None))
820                            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))
821                    }
822                };
823                if let Ok((layout2, layout_dump2)) = layout2_result {
824                    // Adopt only when the template layout lands exactly on the
825                    // form DOM page count — this is the strongest possible signal
826                    // that the template structure is fully consistent with the
827                    // original rendering.
828                    if layout2.pages.len() > layout.pages.len() && layout2.pages.len() == fdp_count
829                    {
830                        tree = tree2;
831                        layout = layout2;
832                        layout_dump = layout_dump2;
833                        log::debug!(
834                            "XFA layout: re-run produced {} pages — using template-only layout",
835                            layout.pages.len()
836                        );
837                    }
838                }
839            }
840        }
841    }
842
843    // XFA Spec §4.3: suppress page subforms whose data is empty or absent.
844    // A page with fields but no populated values is considered "data-empty"
845    // and should be suppressed.  Pages without fields (static-only pages with
846    // draws/images) are always kept.  At least one page is retained.
847    //
848    // Guard: only suppress when the merger actually bound at least one field
849    // value from the DataDom.  When no binding occurred (e.g. the datasets
850    // packet contains only server-infrastructure metadata without any nodes
851    // matching template field names), all field values come from template
852    // defaults and the suppression heuristic must not fire — doing so would
853    // incorrectly drop structurally-required template pages.
854    //
855    // Cap: when the PDF has a pre-saved form DOM (written by Adobe Reader after
856    // script execution), its page-area count is the authoritative rendered page
857    // count.  We may suppress at most (layout_pages - form_dom_pages) pages so
858    // that pages which are structurally required but happen to be data-empty
859    // (e.g. alarm-configuration sections the user hasn't filled in yet) are
860    // never incorrectly dropped.
861    // M5.2: `suppress_empty_pages_only_when_real_data_bound` is now the
862    // executable preflight gate. The decision is bit-identical to the
863    // inline `if layout.pages.len() > 1 && tree.any_data_bound { … }` it
864    // replaces; emits one trace event per branch (gated/skipped) so
865    // consumers can observe the decision.
866    let preflight =
867        suppress_empty_pages_only_when_real_data_bound(layout.pages.len(), tree.any_data_bound);
868    if preflight.run_suppression {
869        // UX1: form-DOM cap is expressed as an executable Adobe-compat
870        // rule (`cap_suppression_by_form_dom`). The function returns the
871        // upper bound on dropped pages and emits a
872        // (suppress, SuppressCappedByFormDom) trace event when the cap is
873        // consulted. Behaviour is bit-identical to the prior inline match.
874        let cap_decision =
875            cap_suppression_by_form_dom(layout.pages.len(), form_xml.and_then(form_dom_page_count));
876        let max_suppress = cap_decision.max_suppress;
877
878        // M5.2: per-flatten counter for the bind-none exclusion summary.
879        // The counter is filled by `page_has_fields` and reported once
880        // after the scan via `emit_bind_none_summary`.
881        let bind_none_count: Cell<usize> = Cell::new(0);
882        // M5.2b: same pattern for the non-data-widget rule; emitted via
883        // `emit_non_data_widget_summary`.
884        let widget_count: Cell<usize> = Cell::new(0);
885
886        let mut suppressed = 0usize;
887        let keep: Vec<bool> = layout
888            .pages
889            .iter()
890            .enumerate()
891            .map(|(page_index, p)| {
892                if page_has_fields(&p.nodes, &tree, &bind_none_count, &widget_count)
893                    && !page_has_field_data(&p.nodes, &tree)
894                    && suppressed < max_suppress
895                {
896                    suppressed += 1;
897                    trace_sites::suppress(
898                        TraceReason::SuppressEmptyDataPageDropped,
899                        page_index as u32,
900                        "data_empty_page_dropped",
901                    );
902                    false
903                } else {
904                    true
905                }
906            })
907            .collect();
908
909        emit_bind_none_summary(bind_none_count.get());
910        emit_non_data_widget_summary(widget_count.get());
911        let any_keep = keep.iter().any(|&k| k);
912        if any_keep {
913            let mut idx = 0;
914            layout.pages.retain(|_| {
915                let k = keep[idx];
916                idx += 1;
917                k
918            });
919            if let Some(ref mut dump) = layout_dump {
920                let mut idx = 0;
921                dump.pages.retain(|_| {
922                    let k = keep[idx];
923                    idx += 1;
924                    k
925                });
926            }
927        }
928        // When NO page has data, keep all pages: the form is empty and
929        // all structural pages should be preserved (e.g. a 6-page
930        // inspection report with no filled-in values).
931    }
932
933    if let Some(ref mut dump) = layout_dump {
934        renumber_layout_dump_pages(dump);
935    }
936
937    // PIPELINE: stage 3 — Render (generate XFA overlay content streams from layout)
938    debug_assert!(
939        _stage <= PipelineStage::Render,
940        "pipeline stage order violated: expected <= Render"
941    );
942    _stage = PipelineStage::Render;
943
944    let mut doc = match Document::load_mem(pdf_bytes) {
945        Ok(d) => d,
946        Err(_) => {
947            eprintln!("lopdf load failed, creating minimal PDF structure for XFA layout");
948            create_minimal_pdf_document()
949        }
950    };
951
952    // PIPELINE: stage 4 — Embed (embed fonts/images into PDF document)
953    debug_assert!(
954        _stage <= PipelineStage::Embed,
955        "pipeline stage order violated: expected <= Embed"
956    );
957    _stage = PipelineStage::Embed;
958
959    // PERF: embed_resolved_fonts is O(f * p) where f = unique resolved fonts
960    // and p = PDF pages.  Each font requires a full font-program copy into the
961    // PDF object stream plus /Widths array serialisation.  For documents with
962    // many embedded fonts and many pages this is the dominant allocation source.
963    // Potential optimisation: share font objects across pages (already done for
964    // standard Type1 fonts F1-F3; extend to TrueType/CID fonts).
965    let (font_map, embedded_font_objects, metrics_data) =
966        embed_resolved_fonts(&mut doc, &resolved_fonts, &layout);
967
968    let config = XfaRenderConfig {
969        font_map,
970        font_metrics_data: metrics_data,
971        ..Default::default()
972    };
973
974    let overlays = generate_all_overlays(&layout, &config)
975        .map_err(|e| XfaError::LayoutFailed(format!("overlay generation: {e:?}")))?;
976
977    log::debug!(
978        "XFA render: {} content streams generated ({} bytes total)",
979        overlays.len(),
980        overlays
981            .iter()
982            .map(|o| o.content_stream.len())
983            .sum::<usize>()
984    );
985
986    // Register standard PDF fonts: F1=Times-Roman (serif), F2=Helvetica (sans), F3=Courier (mono).
987    let font_ids: [ObjectId; 3] = [
988        doc.add_object(Object::Dictionary(dictionary! {
989            "Type"     => Object::Name(b"Font".to_vec()),
990            "Subtype"  => Object::Name(b"Type1".to_vec()),
991            "BaseFont" => Object::Name(b"Times-Roman".to_vec()),
992            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
993        })),
994        doc.add_object(Object::Dictionary(dictionary! {
995            "Type"     => Object::Name(b"Font".to_vec()),
996            "Subtype"  => Object::Name(b"Type1".to_vec()),
997            "BaseFont" => Object::Name(b"Helvetica".to_vec()),
998            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
999        })),
1000        doc.add_object(Object::Dictionary(dictionary! {
1001            "Type"     => Object::Name(b"Font".to_vec()),
1002            "Subtype"  => Object::Name(b"Type1".to_vec()),
1003            "BaseFont" => Object::Name(b"Courier".to_vec()),
1004            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1005        })),
1006    ];
1007
1008    let existing_page_ids: Vec<ObjectId> = doc.page_iter().collect();
1009    let n_layout = overlays.len();
1010    let n_existing = existing_page_ids.len();
1011
1012    // XFA Spec 3.3 §9.1 — Static vs Dynamic Forms: a form is static (XFAF)
1013    // when it uses only the restricted XFAF grammar subset (§7.6).  In
1014    // practice, Adobe identifies static forms by `baseProfile="interactiveForms"`
1015    // on the <template> element.  A dynamic form uses the full XFA grammar
1016    // and re-lays out content based on data/scripts.
1017    //
1018    // §7.6 enumerates grammar excluded from XFAF: area, occur (non-default),
1019    // multiple pageAreas, scripts that modify instance count, etc.
1020    //
1021    // Our detection uses baseProfile — this matches Adobe's behavior.  A more
1022    // rigorous check would inspect the template grammar for XFAF-excluded
1023    // elements, but baseProfile is the standard signal in real-world PDFs.
1024    let is_static_form = template_xml.contains("baseProfile=\"interactiveForms\"");
1025    let has_static_content = pages_have_static_content(&doc);
1026
1027    // Preserve pre-rendered PDF page content when:
1028    // 1. Explicit static form (baseProfile="interactiveForms"), OR
1029    // 2. Pages have substantial pre-rendered content AND the XFA layout
1030    //    produces at least as many pages as the original AND the XFA overlay
1031    //    has enough content to indicate a full page re-render.
1032    // 3. Layout engine produces fewer pages than the original — regardless
1033    //    of whether we detect static content in page streams. XFA PDFs often
1034    //    have form content in widget annotations rather than page content
1035    //    streams, so `has_static_content` may return false even when pages
1036    //    have substantial pre-rendered form content. When our layout is
1037    //    incomplete (fewer pages), preserving the original pages matches
1038    //    Adobe/pdfRest output better than truncated single-page output.
1039    //
1040    //    When the XFA overlay is minimal (e.g. just a title/header), the form
1041    //    relies on AcroForm widgets for its content. Preserving static content
1042    //    + baking widgets adds spurious form fields. Using the XFA path gives
1043    //    the correct minimal output matching pdfrest/Adobe behavior.
1044    //    When the XFA overlay is substantial (re-renders the full page), the
1045    //    pre-rendered content is authoritative — replacing it with XFA causes
1046    //    SSIM regressions due to font/rendering differences.
1047    //
1048    // The 1000-byte threshold separates minimal XFA templates (title/header
1049    // only, ~200-500 bytes) from full page re-renders (5000+ bytes).
1050    let overlay_is_substantial = overlays.iter().any(|o| o.content_stream.len() > 1000);
1051    // Clamp 1-page over-pagination only for static XFAF forms. Dynamic forms
1052    // often start from a 1-page placeholder PDF and legitimately flow onto
1053    // additional pages once XFA data is laid out. Clamping all 1-page inputs
1054    // to the original page count causes under-pagination on dynamic forms such
1055    // as Travel Expense Report / Checklist where Adobe renders 2-3 pages.
1056    //
1057    // When the layout produces MORE pages than the original PDF has, the form is
1058    // clearly dynamic — it needs to overflow onto new pages. In that case the
1059    // "has pre-rendered static content + substantial overlay" heuristic must not
1060    // fire, because the original single-page placeholder is not authoritative
1061    // for multi-record dynamic forms.
1062    let preserve_static = is_static_form
1063        || n_layout < n_existing
1064        || (n_layout <= n_existing && has_static_content && overlay_is_substantial);
1065
1066    // PIPELINE: stage 5 — Write (write content streams to PDF pages)
1067    debug_assert!(
1068        _stage <= PipelineStage::Write,
1069        "pipeline stage order violated: expected <= Write"
1070    );
1071    _stage = PipelineStage::Write;
1072
1073    if preserve_static {
1074        let baked = flatten_widget_appearances(&mut doc);
1075        if baked == 0 {
1076            // No widget APs were baked — the form structure lives in the
1077            // pre-rendered page content but field values exist only in the
1078            // XFA overlay.  Generate a lightweight overlay with just field
1079            // value text (no backgrounds/borders/captions) and append it
1080            // on top so field values become visible without visual artifacts.
1081            if let Ok(fv_overlays) = generate_field_values_overlays(&layout, &config) {
1082                for (i, overlay) in fv_overlays.iter().enumerate() {
1083                    if i < n_existing && !overlay.content_stream.is_empty() {
1084                        let _ = overlay_page_content(
1085                            &mut doc,
1086                            existing_page_ids[i],
1087                            overlay,
1088                            &font_ids,
1089                            &embedded_font_objects,
1090                        );
1091                    }
1092                }
1093            }
1094        }
1095        // When widgets WERE baked, their AP streams already contain field
1096        // content.  Overlaying XFA on top of baked widget appearances
1097        // causes ghost/double text because widget APs may contain rotation
1098        // matrices that produce differently-positioned text.
1099    } else {
1100        // Dynamic form: the layout engine determines page count.
1101        // Write each layout page to the output: overwrite existing pages
1102        // and add new pages when the layout produces more than the original.
1103        // NOTE: page cap (n_layout.min(n_existing)) was removed — it caused
1104        // 30 GATE #12 regressions because dynamic XFA forms often have a
1105        // single placeholder page while the actual form has many data-driven
1106        // pages. Capping to n_existing destroyed multi-page content.
1107        for (i, overlay) in overlays.iter().enumerate() {
1108            if i < n_existing {
1109                let lp = &layout.pages[i];
1110                write_page_content(
1111                    &mut doc,
1112                    existing_page_ids[i],
1113                    overlay,
1114                    &font_ids,
1115                    &embedded_font_objects,
1116                    Some(lp.width),
1117                    Some(lp.height),
1118                )?;
1119            } else {
1120                let lp = &layout.pages[i];
1121                add_new_page(
1122                    &mut doc,
1123                    lp.width,
1124                    lp.height,
1125                    overlay,
1126                    &font_ids,
1127                    &embedded_font_objects,
1128                )?;
1129            }
1130        }
1131
1132        // Bake checkbox/radio AP marks from AcroForm widgets onto existing
1133        // pages.  The XFA overlay draws borders and captions; the AP "on"
1134        // stream adds the filled mark (circle, checkmark, etc.) that the
1135        // oracle renders for hybrid forms (#886).
1136        for &page_id in &existing_page_ids[..n_existing.min(n_layout)] {
1137            bake_checkbox_radio_ap_marks(&mut doc, page_id);
1138        }
1139    }
1140
1141    // Remove excess pages when XFA layout produces fewer pages than the
1142    // original static content. This is the core fix for over-pagination
1143    // (#744): XFA PDFs often carry pre-rendered static pages that far exceed
1144    // the dynamic page count Adobe would produce.
1145    // For dynamic hybrid forms (preserve_static via has_static), keep all pages
1146    // because their content lives in existing PDF streams (#750).
1147    // Exception: static XFAF forms (baseProfile="interactiveForms") have a
1148    // fixed template-declared page count.  Extra pages in the original PDF are
1149    // surplus placeholders from a previous rendering and must be trimmed — BUT
1150    // only when the form DOM does not indicate more pages are expected.  When the
1151    // form DOM declares more pages than the layout produced (e.g. presence-based
1152    // suppression hid some pages), the form DOM count is authoritative and excess
1153    // trimming must not reduce below the form-DOM page count.
1154    //
1155    // XFA §7.6 / §9.1: an XFAF form excludes script execution and instance
1156    // mutation from its grammar.  When the template contains zero `<script>`
1157    // blocks and zero FormCalc AND the layout collapses to a single page, the
1158    // form-DOM page count cannot be hiding multi-page content from us — the
1159    // form trivially fits on one page.  In that narrow situation, host pages
1160    // beyond page 1 are stale placeholders that must be trimmed to match Adobe.
1161    //
1162    // The single-page guard is intentionally conservative: it avoids regressing
1163    // forms whose layout legitimately under-produces (e.g. 7-of-17 pages) by
1164    // limiting the relaxation to layouts that produced exactly one page.
1165    // M5.2: `static_xfaf_excess_page_trim_with_form_dom_guard` is now the
1166    // executable rule. The function returns the same `allow_trim` decision
1167    // the inline expression produced for every input, plus a trace
1168    // anchor (suppress, static_xfaf_trim_allowed / _blocked). Behaviour
1169    // is bit-identical; the regression guards
1170    // `corpus_7dbbe9d9_one_page`, `corpus_322faac4_seventeen_pages`,
1171    // and `corpus_fe5de953_one_page` continue to produce their oracle
1172    // page counts.
1173    let template_has_dynamic_logic = template_xml.contains("<script")
1174        || template_xml.contains(r#"contentType="application/x-formcalc""#);
1175    let trim_decision = static_xfaf_excess_page_trim_with_form_dom_guard(
1176        is_static_form,
1177        template_has_dynamic_logic,
1178        n_layout,
1179        form_xml.and_then(form_dom_page_count),
1180    );
1181    let static_can_trim = trim_decision.allow_trim;
1182    if n_layout < n_existing && (!preserve_static || static_can_trim) {
1183        // delete_pages takes 1-indexed page numbers, highest first to avoid
1184        // index shifts.
1185        let excess: Vec<u32> = ((n_layout + 1) as u32..=(n_existing as u32))
1186            .rev()
1187            .collect();
1188        doc.delete_pages(&excess);
1189    }
1190
1191    if is_static_form {
1192        // Static forms: strip Widget annotations but keep non-Widget (links,
1193        // stamps, etc.).  flatten_widget_appearances already baked widgets
1194        // with AP into the page content and removed them from Annots, but
1195        // widgets without AP may remain.  Remove those too so PDF viewers
1196        // don't render interactive fields over the baked content.
1197        for &page_id in &existing_page_ids {
1198            strip_widget_annotations(&mut doc, page_id);
1199        }
1200    } else {
1201        // Dynamic/hybrid forms: strip ALL annotations — pages were
1202        // overwritten by XFA layout or widget baking covered field values.
1203        for &page_id in existing_page_ids.iter().take(n_layout.min(n_existing)) {
1204            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
1205                dict.remove(b"Annots");
1206            }
1207        }
1208    }
1209
1210    // PIPELINE: stage 6 — Cleanup (remove AcroForm/XFA markers)
1211    debug_assert!(
1212        _stage <= PipelineStage::Cleanup,
1213        "pipeline stage order violated: expected <= Cleanup"
1214    );
1215    #[allow(unused_assignments)]
1216    {
1217        _stage = PipelineStage::Cleanup;
1218    }
1219
1220    remove_acroform(&mut doc);
1221    let stripped_js = javascript_policy::strip_javascript_for_flatten(&mut doc);
1222    if stripped_js > 0 {
1223        log::warn!("stripped {stripped_js} JavaScript action(s) from flattened output");
1224    }
1225
1226    let mut out = Vec::new();
1227    doc.save_to(&mut out)
1228        .map_err(|e| XfaError::LayoutFailed(format!("save: {e}")))?;
1229    Ok(FlattenOutput::new(
1230        out,
1231        layout_dump.unwrap_or_default(),
1232        dynamic_scripts,
1233    ))
1234}
1235
1236fn layout_dump_from_profile(profile: LayoutProfile) -> LayoutDump {
1237    LayoutDump {
1238        pages: profile
1239            .pages
1240            .into_iter()
1241            .enumerate()
1242            .map(|(idx, page)| LayoutDumpEntry {
1243                page_num: idx as u32 + 1,
1244                page_height: page.page_height,
1245                used_height: page.used_height,
1246                overflow_to_next: page.overflow_to_next,
1247                first_overflow_element: page.first_overflow_element,
1248            })
1249            .collect(),
1250        ..Default::default()
1251    }
1252}
1253
1254fn renumber_layout_dump_pages(dump: &mut LayoutDump) {
1255    for (idx, page) in dump.pages.iter_mut().enumerate() {
1256        page.page_num = idx as u32 + 1;
1257    }
1258}
1259
1260// ---------------------------------------------------------------------------
1261// Embedded image files extraction (XFA §2.3 href resolution)
1262// ---------------------------------------------------------------------------
1263
1264/// Extract embedded files from the PDF's Names/EmbeddedFiles tree.
1265///
1266/// XFA `<image href=".\filename.jpg">` references are resolved against this
1267/// tree at merge time (XFA Spec 3.3 §2.3).  The returned map is keyed by
1268/// the filename as it appears in the Names array (e.g. `.\lintje.jpg`).
1269fn extract_embedded_images(doc: &Document) -> HashMap<String, Vec<u8>> {
1270    let mut images = HashMap::new();
1271
1272    // Helper: resolve a potentially indirect object.
1273    fn deref_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
1274        match obj {
1275            Object::Reference(id) => doc.get_dictionary(*id).ok(),
1276            Object::Dictionary(d) => Some(d),
1277            _ => None,
1278        }
1279    }
1280
1281    // Helper: extract stream content (decompressed).
1282    fn extract_stream(doc: &Document, obj: &Object) -> Option<Vec<u8>> {
1283        let stream_obj = match obj {
1284            Object::Reference(id) => doc.get_object(*id).ok()?,
1285            other => other,
1286        };
1287        if let Object::Stream(ref stream) = *stream_obj {
1288            let mut s = stream.clone();
1289            let _ = s.decompress();
1290            Some(s.content.clone())
1291        } else {
1292            None
1293        }
1294    }
1295
1296    // Traverse: Catalog → /Names → /EmbeddedFiles → /Names array
1297    let catalog = match doc.catalog() {
1298        Ok(c) => c,
1299        Err(_) => return images,
1300    };
1301    let names_obj = match catalog.get(b"Names") {
1302        Ok(obj) => obj,
1303        Err(_) => {
1304            eprintln!("[img-href] no /Names in catalog");
1305            return images;
1306        }
1307    };
1308    let names_dict = match deref_dict(doc, names_obj) {
1309        Some(d) => d,
1310        None => return images,
1311    };
1312    // XFA PDFs may use /XFAImages instead of /EmbeddedFiles for image
1313    // references.  Check both keys.
1314    let ef_obj = match names_dict
1315        .get(b"XFAImages")
1316        .or_else(|_| names_dict.get(b"EmbeddedFiles"))
1317    {
1318        Ok(obj) => obj,
1319        Err(_) => return images,
1320    };
1321    let ef_dict = match deref_dict(doc, ef_obj) {
1322        Some(d) => d,
1323        None => return images,
1324    };
1325
1326    // The EmbeddedFiles name tree has a /Names array: [(name1, ref1), …]
1327    let names_arr_obj = match ef_dict.get(b"Names") {
1328        Ok(obj) => obj,
1329        Err(_) => return images,
1330    };
1331    let names_array = match names_arr_obj {
1332        Object::Array(arr) => arr,
1333        Object::Reference(id) => match doc.get_object(*id) {
1334            Ok(Object::Array(arr)) => arr,
1335            _ => return images,
1336        },
1337        _ => return images,
1338    };
1339
1340    // Process pairs: (name_string, value_ref)
1341    let mut i = 0;
1342    while i + 1 < names_array.len() {
1343        let name = match &names_array[i] {
1344            Object::String(bytes, _) => String::from_utf8_lossy(bytes).to_string(),
1345            _ => {
1346                i += 2;
1347                continue;
1348            }
1349        };
1350
1351        // The value can be:
1352        //   1. A FileSpec dict: /EF → /F → stream
1353        //   2. Directly a stream (non-standard but seen in XFA PDFs)
1354        let value_ref = &names_array[i + 1];
1355
1356        // Try path 1: FileSpec dict
1357        if let Some(filespec) = deref_dict(doc, value_ref) {
1358            if let Ok(ef_obj) = filespec.get(b"EF") {
1359                if let Some(ef) = deref_dict(doc, ef_obj) {
1360                    if let Ok(f_ref) = ef.get(b"F") {
1361                        if let Some(data) = extract_stream(doc, f_ref) {
1362                            images.insert(name.clone(), data);
1363                            i += 2;
1364                            continue;
1365                        }
1366                    }
1367                }
1368            }
1369        }
1370
1371        // Try path 2: Direct stream reference
1372        if let Some(data) = extract_stream(doc, value_ref) {
1373            images.insert(name.clone(), data);
1374        }
1375
1376        i += 2;
1377    }
1378    images
1379}
1380
1381// ---------------------------------------------------------------------------
1382// Font extraction, resolution, and embedding
1383// ---------------------------------------------------------------------------
1384
1385#[doc(hidden)]
1386pub fn extract_embedded_fonts(doc: &Document) -> Vec<EmbeddedFontData> {
1387    let mut fonts = Vec::new();
1388    let mut seen = std::collections::HashSet::new();
1389    for (&font_object_id, obj) in &doc.objects {
1390        let dict = match obj.as_dict() {
1391            Ok(d) => d,
1392            Err(_) => continue,
1393        };
1394        let is_font =
1395            dict.get(b"Type").ok().and_then(|o| o.as_name().ok()) == Some(b"Font".as_slice());
1396        if !is_font {
1397            continue;
1398        }
1399        let base_font = match dict.get(b"BaseFont").ok().and_then(|o| o.as_name().ok()) {
1400            Some(n) => String::from_utf8_lossy(n).to_string(),
1401            None => continue,
1402        };
1403
1404        let pdf_widths = extract_font_widths(dict);
1405        let pdf_encoding = extract_font_encoding(doc, dict);
1406        let pdf_source_font =
1407            extract_simple_pdf_source_font(doc, font_object_id, dict, pdf_widths.as_ref());
1408
1409        // First try direct FontDescriptor path (simple TrueType/OpenType fonts)
1410        if let Some((stream_id, data)) = extract_font_from_direct_fd(doc, dict, &base_font) {
1411            if seen.insert(stream_id) {
1412                store_font_data(
1413                    &mut fonts,
1414                    &base_font,
1415                    data,
1416                    pdf_widths.clone(),
1417                    pdf_encoding.clone(),
1418                    pdf_source_font,
1419                );
1420            }
1421            continue;
1422        }
1423
1424        // For CIDFont Type0: also check DescendantFonts path
1425        // CIDFont fonts store their font data in /DescendantFonts[n]/FontDescriptor/FontFile*
1426        // CID fonts use /W arrays (PDF spec §9.7.4.3) instead of simple /Widths.
1427        if let Some((stream_id, data)) = extract_cidfont_data(doc, dict, &base_font, &seen) {
1428            if seen.insert(stream_id) {
1429                let cid_widths = extract_cid_font_widths(doc, dict);
1430                store_font_data(&mut fonts, &base_font, data, cid_widths, None, None);
1431            }
1432            continue;
1433        }
1434
1435        if let Some(source_font) = pdf_source_font {
1436            // fix(#811): if the source PDF already exposes a reusable simple
1437            // font object with /Widths, keep that object alive through the XFA
1438            // pipeline. PDF 1.7 §5.5 defines those widths as the authoritative
1439            // simple-font metrics, and XFA 3.3 §11.7.1 relies on those metrics
1440            // for field fitting.
1441            store_font_data(
1442                &mut fonts,
1443                &base_font,
1444                Vec::new(),
1445                pdf_widths.clone(),
1446                pdf_encoding.clone(),
1447                Some(source_font),
1448            );
1449        }
1450    }
1451    fonts
1452}
1453
1454/// Extract /FirstChar, /LastChar, and /Widths from a font dictionary.
1455fn extract_font_widths(dict: &lopdf::Dictionary) -> Option<(u16, Vec<u16>)> {
1456    let first_char = dict.get(b"FirstChar").ok()?.as_i64().ok()? as u16;
1457    let _last_char = dict.get(b"LastChar").ok()?.as_i64().ok()? as u16;
1458    let widths_array = dict.get(b"Widths").ok()?.as_array().ok()?;
1459    let widths: Vec<u16> = widths_array
1460        .iter()
1461        .filter_map(|w| w.as_i64().ok().map(|v| v as u16))
1462        .collect();
1463    if widths.is_empty() {
1464        return None;
1465    }
1466    Some((first_char, widths))
1467}
1468
1469/// Extract CID font widths from a Type0 (composite) font's `/W` array.
1470///
1471/// CID fonts (PDF spec §9.7.4.3, Table 114) use a different width format than
1472/// simple fonts. Instead of `/FirstChar` + `/Widths`, they use a `/W` array in
1473/// the CIDFont descendant dictionary with two element types:
1474///
1475///   `cid_start [w1 w2 w3 ...]`   — consecutive CIDs starting at cid_start
1476///   `cid_first cid_last width`   — range of CIDs all sharing the same width
1477///
1478/// `/DW` (default width, defaults to 1000) applies to CIDs not listed in `/W`.
1479///
1480/// The result is converted to the same `(first_char, widths)` representation
1481/// used by simple fonts, where `widths[cid - first_char]` gives the width.
1482///
1483/// LIMITATION: CID-to-Unicode mapping via ToUnicode CMap is not parsed here;
1484/// the widths are indexed by raw CID values.
1485fn extract_cid_font_widths(
1486    doc: &Document,
1487    type0_dict: &lopdf::Dictionary,
1488) -> Option<(u16, Vec<u16>)> {
1489    let descendants = type0_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
1490    let desc_ref = descendants.first()?;
1491    let cid_dict = match desc_ref {
1492        Object::Reference(id) => doc.get_dictionary(*id).ok()?,
1493        Object::Dictionary(d) => d,
1494        _ => return None,
1495    };
1496
1497    let default_width = cid_dict
1498        .get(b"DW")
1499        .ok()
1500        .and_then(|o| o.as_i64().ok())
1501        .unwrap_or(1000) as u16;
1502
1503    let w_array = cid_dict.get(b"W").ok()?;
1504    let w_array = match resolve_object(doc, w_array) {
1505        Some(obj) => obj.as_array().ok()?,
1506        None => return None,
1507    };
1508
1509    if w_array.is_empty() {
1510        return None;
1511    }
1512
1513    // First pass: collect all (cid, width) pairs to find bounds.
1514    let mut entries: Vec<(u16, u16)> = Vec::new();
1515    let mut i = 0;
1516    while i < w_array.len() {
1517        let cid_start = match w_array[i].as_i64() {
1518            Ok(v) => v as u16,
1519            Err(_) => {
1520                i += 1;
1521                continue;
1522            }
1523        };
1524        i += 1;
1525        if i >= w_array.len() {
1526            break;
1527        }
1528
1529        // Next element: array → consecutive widths, integer → range end
1530        if let Ok(widths_arr) = w_array[i].as_array() {
1531            // Format: cid_start [w1 w2 w3 ...]
1532            for (j, w_obj) in widths_arr.iter().enumerate() {
1533                if let Ok(w) = w_obj.as_i64() {
1534                    entries.push((cid_start + j as u16, w as u16));
1535                }
1536            }
1537            i += 1;
1538        } else if let Ok(cid_last) = w_array[i].as_i64() {
1539            // Format: cid_first cid_last width
1540            i += 1;
1541            if i >= w_array.len() {
1542                break;
1543            }
1544            if let Ok(width) = w_array[i].as_i64() {
1545                let cid_last = cid_last as u16;
1546                for cid in cid_start..=cid_last {
1547                    entries.push((cid, width as u16));
1548                }
1549            }
1550            i += 1;
1551        } else {
1552            i += 1;
1553        }
1554    }
1555
1556    if entries.is_empty() {
1557        return None;
1558    }
1559
1560    let min_cid = entries.iter().map(|(c, _)| *c).min().unwrap();
1561    let max_cid = entries.iter().map(|(c, _)| *c).max().unwrap();
1562    let len = (max_cid - min_cid + 1) as usize;
1563    let mut widths = vec![default_width; len];
1564    for (cid, w) in &entries {
1565        widths[(*cid - min_cid) as usize] = *w;
1566    }
1567
1568    Some((min_cid, widths))
1569}
1570
1571/// Parse a simple-font `/Encoding` dictionary with `/Differences`.
1572///
1573/// WHY: Custom encodings via `/Differences` are essential for correct glyph
1574/// width mapping. Without this, widths are indexed against the wrong
1575/// characters and text wrapping breaks for fonts that deviate from WinAnsi.
1576///
1577/// SPEC: PDF spec §9.6.6.1 defines `/Differences` as an alternating array of
1578/// starting code integers and glyph names applied on top of a base encoding.
1579///
1580/// LIMITATION: CID fonts (`/Type0`) use CMaps and `/W` arrays instead of this
1581/// simple-font encoding mechanism, so they intentionally return `None` here.
1582fn extract_font_encoding(doc: &Document, dict: &lopdf::Dictionary) -> Option<PdfSimpleEncoding> {
1583    let encoding_obj = resolve_object(doc, dict.get(b"Encoding").ok()?)?;
1584    let encoding_dict = encoding_obj.as_dict().ok()?;
1585    let differences_array = resolve_object(doc, encoding_dict.get(b"Differences").ok()?)?
1586        .as_array()
1587        .ok()?;
1588
1589    let base_encoding = encoding_dict
1590        .get(b"BaseEncoding")
1591        .ok()
1592        .and_then(|obj| resolve_object(doc, obj))
1593        .and_then(|obj| obj.as_name().ok())
1594        .and_then(PdfBaseEncoding::from_pdf_name)
1595        .unwrap_or(PdfBaseEncoding::WinAnsi);
1596
1597    let mut differences = Vec::new();
1598    let mut current_code: Option<u8> = None;
1599    for item in differences_array {
1600        let item = resolve_object(doc, item)?;
1601        if let Ok(code) = item.as_i64() {
1602            current_code = u8::try_from(code).ok();
1603            continue;
1604        }
1605
1606        let Some(name) = item.as_name().ok() else {
1607            continue;
1608        };
1609        let Some(code) = current_code else {
1610            continue;
1611        };
1612        let Some(glyph_name) = std::str::from_utf8(name).ok() else {
1613            continue;
1614        };
1615        if let Some(unicode) = pdf_glyph_name_to_unicode(glyph_name) {
1616            differences.push((code, unicode));
1617        }
1618        current_code = code.checked_add(1);
1619    }
1620
1621    if differences.is_empty() {
1622        return None;
1623    }
1624
1625    Some(PdfSimpleEncoding {
1626        base_encoding,
1627        differences,
1628    })
1629}
1630
1631fn extract_simple_pdf_source_font(
1632    doc: &Document,
1633    font_object_id: ObjectId,
1634    dict: &lopdf::Dictionary,
1635    pdf_widths: Option<&(u16, Vec<u16>)>,
1636) -> Option<PdfSourceFont> {
1637    pdf_widths?;
1638
1639    let subtype = dict.get(b"Subtype").ok().and_then(|obj| obj.as_name().ok());
1640    if subtype == Some(b"Type0".as_slice()) {
1641        return None;
1642    }
1643
1644    // fix(#811): only reuse simple fonts whose emitted PDF text can stay on
1645    // the current WinAnsi path in render_bridge. Fonts with custom encodings
1646    // need a dedicated byte encoder first; otherwise we would preserve widths
1647    // but emit the wrong character codes.
1648    //
1649    // PDF 1.7 §5.5 defines simple-font widths in the font's encoding space.
1650    // LIMITATION: CID/Type0 fonts use /W arrays and CMaps instead; they are
1651    // intentionally excluded here.
1652    let encoding_obj = dict
1653        .get(b"Encoding")
1654        .ok()
1655        .and_then(|obj| resolve_object(doc, obj));
1656    match encoding_obj {
1657        Some(obj) if obj.as_name().ok() == Some(b"WinAnsiEncoding".as_slice()) => {}
1658        Some(obj) => {
1659            let base = obj
1660                .as_dict()
1661                .ok()
1662                .and_then(|enc| enc.get(b"BaseEncoding").ok())
1663                .and_then(|base| resolve_object(doc, base))
1664                .and_then(|base| base.as_name().ok());
1665            if base != Some(b"WinAnsiEncoding".as_slice()) {
1666                return None;
1667            }
1668            if obj
1669                .as_dict()
1670                .ok()
1671                .and_then(|enc| enc.get(b"Differences").ok())
1672                .is_some()
1673            {
1674                return None;
1675            }
1676        }
1677        None => return None,
1678    }
1679
1680    Some(PdfSourceFont {
1681        object_id: font_object_id,
1682    })
1683}
1684
1685fn resolve_object<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
1686    match obj {
1687        Object::Reference(id) => doc.get_object(*id).ok(),
1688        other => Some(other),
1689    }
1690}
1691
1692/// Extract font data from a direct FontDescriptor (FontFile2/3/1 in FontDescriptor).
1693fn extract_font_from_direct_fd(
1694    doc: &Document,
1695    font_dict: &lopdf::Dictionary,
1696    _base_font: &str,
1697) -> Option<(lopdf::ObjectId, Vec<u8>)> {
1698    let fd_id = font_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
1699    let fd = doc.get_dictionary(fd_id).ok()?;
1700
1701    let font_stream_id = fd
1702        .get(b"FontFile2")
1703        .or_else(|_| fd.get(b"FontFile3"))
1704        .or_else(|_| fd.get(b"FontFile"))
1705        .ok()?
1706        .as_reference()
1707        .ok()?;
1708
1709    let stream = doc
1710        .get_object(font_stream_id)
1711        .and_then(|o| o.as_stream())
1712        .ok()?;
1713
1714    let data = stream
1715        .get_plain_content()
1716        .unwrap_or_else(|_| stream.content.clone());
1717
1718    if data.is_empty() {
1719        return None;
1720    }
1721
1722    Some((font_stream_id, data))
1723}
1724
1725/// Extract font data from CIDFont Type0's DescendantFonts path.
1726///
1727/// CIDFont Type0 fonts have their font data in:
1728///   /DescendantFonts[n] /CIDFont /FontDescriptor /FontFile*
1729fn extract_cidfont_data(
1730    doc: &Document,
1731    font_dict: &lopdf::Dictionary,
1732    _base_font: &str,
1733    seen: &std::collections::HashSet<lopdf::ObjectId>,
1734) -> Option<(lopdf::ObjectId, Vec<u8>)> {
1735    // Check if this is a Type0 (composite) font by looking for DescendantFonts
1736    let descendants = font_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
1737
1738    // Iterate through descendant CIDFonts
1739    for desc_ref in descendants {
1740        let desc_id = desc_ref.as_reference().ok()?;
1741        let desc_dict = doc.get_dictionary(desc_id).ok()?;
1742
1743        // Check if this descendant is a CIDFont (has FontDescriptor)
1744        let fd_id = desc_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
1745        let fd = doc.get_dictionary(fd_id).ok()?;
1746
1747        // Try FontFile3 first (CFF font for CIDFontType0C), then FontFile2 (TrueType)
1748        let font_stream_id = fd
1749            .get(b"FontFile3")
1750            .or_else(|_| fd.get(b"FontFile2"))
1751            .or_else(|_| fd.get(b"FontFile"))
1752            .ok()?
1753            .as_reference()
1754            .ok()?;
1755
1756        if seen.contains(&font_stream_id) {
1757            continue;
1758        }
1759
1760        let stream = doc
1761            .get_object(font_stream_id)
1762            .and_then(|o| o.as_stream())
1763            .ok()?;
1764
1765        let data = stream
1766            .get_plain_content()
1767            .unwrap_or_else(|_| stream.content.clone());
1768
1769        if !data.is_empty() {
1770            return Some((font_stream_id, data));
1771        }
1772    }
1773    None
1774}
1775
1776/// Store font data under multiple names (PostScript name, family name, normalized name).
1777fn store_font_data(
1778    fonts: &mut Vec<EmbeddedFontData>,
1779    base_font: &str,
1780    data: Vec<u8>,
1781    pdf_widths: Option<(u16, Vec<u16>)>,
1782    pdf_encoding: Option<PdfSimpleEncoding>,
1783    pdf_source_font: Option<PdfSourceFont>,
1784) {
1785    let clean_name = if let Some(pos) = base_font.find('+') {
1786        base_font[pos + 1..].to_string()
1787    } else {
1788        base_font.to_string()
1789    };
1790    let allow_family_alias = family_alias_is_regular_face(&clean_name, &data);
1791
1792    // Store under the PostScript name (subset prefix already stripped)
1793    fonts.push(EmbeddedFontData {
1794        name: clean_name.clone(),
1795        data: data.clone(),
1796        pdf_widths: pdf_widths.clone(),
1797        pdf_encoding: pdf_encoding.clone(),
1798        pdf_source_font,
1799    });
1800
1801    // Store additional aliases from the font name table. The bare family name
1802    // (e.g. "Arial") is only attached to the regular face so a normal-weight
1803    // XFA request does not get hijacked by a bold/italic variant.
1804    if let Ok(face) = ttf_parser::Face::parse(&data, 0) {
1805        for name_record in face.names() {
1806            let allow_alias = match name_record.name_id {
1807                ttf_parser::name_id::FAMILY => allow_family_alias,
1808                ttf_parser::name_id::FULL_NAME | ttf_parser::name_id::POST_SCRIPT_NAME => true,
1809                _ => false,
1810            };
1811            if !allow_alias {
1812                continue;
1813            }
1814            if let Some(alias) = name_record.to_string() {
1815                if alias != clean_name {
1816                    fonts.push(EmbeddedFontData {
1817                        name: alias,
1818                        data: data.clone(),
1819                        pdf_widths: pdf_widths.clone(),
1820                        pdf_encoding: pdf_encoding.clone(),
1821                        pdf_source_font,
1822                    });
1823                }
1824            }
1825        }
1826    }
1827
1828    // Common PostScript-to-family normalization as fallback. As with the name
1829    // table family alias above, reserve the bare family name for the regular
1830    // face so `Arial` resolves to `ArialMT` rather than `Arial-BoldMT`.
1831    let normalized = ps_name_to_family(&clean_name);
1832    if allow_family_alias && normalized != clean_name {
1833        fonts.push(EmbeddedFontData {
1834            name: normalized,
1835            data,
1836            pdf_widths,
1837            pdf_encoding,
1838            pdf_source_font,
1839        });
1840    }
1841}
1842
1843fn family_alias_is_regular_face(clean_name: &str, data: &[u8]) -> bool {
1844    if let Ok(face) = ttf_parser::Face::parse(data, 0) {
1845        if face.is_bold() || face.is_italic() {
1846            return false;
1847        }
1848    }
1849
1850    let lower = clean_name.to_ascii_lowercase();
1851    !lower.contains("bold") && !lower.contains("italic") && !lower.contains("oblique")
1852}
1853
/// Convert a PostScript font name to its likely family name.
///
/// Examples: `ArialMT` → `Arial`, `TimesNewRomanPSMT` → `Times New Roman`,
/// `MyriadPro-Regular` → `Myriad Pro`.
///
/// At most one weight/style suffix is removed (the first match in
/// `STYLE_SUFFIXES` order), then a space is inserted before every uppercase
/// letter that directly follows a lowercase letter. The previous character is
/// tracked as a `char`, so names containing non-ASCII letters are split
/// correctly as well.
fn ps_name_to_family(ps_name: &str) -> String {
    // Order matters: longer, more specific suffixes must be tried before the
    // shorter ones they end with (e.g. "PSMT" before "MT").
    const STYLE_SUFFIXES: [&str; 12] = [
        "PSMT",
        "PS-BoldItalicMT",
        "PS-BoldMT",
        "PS-ItalicMT",
        "-BoldItalicMT",
        "-BoldMT",
        "-ItalicMT",
        "MT",
        "-Regular",
        "-Bold",
        "-Italic",
        "-BoldItalic",
    ];

    let base = STYLE_SUFFIXES
        .iter()
        .find_map(|&suffix| ps_name.strip_suffix(suffix))
        .unwrap_or(ps_name);

    // Insert spaces before uppercase letters that follow a lowercase letter
    // e.g. "TimesNewRoman" → "Times New Roman", "MyriadPro" → "Myriad Pro"
    let mut result = String::with_capacity(base.len() + 4);
    let mut prev: Option<char> = None;
    for ch in base.chars() {
        if ch.is_uppercase() && matches!(prev, Some(p) if p.is_lowercase()) {
            result.push(' ');
        }
        result.push(ch);
        prev = Some(ch);
    }
    result
}
1888
/// Collected font specification from the XFA template.
///
/// Built by `collect_template_font_entries` from the attributes of a `<font>`
/// element; that function only emits entries with a non-empty `typeface`.
struct TemplateFontEntry {
    /// Value of the `typeface` attribute.
    typeface: String,
    /// Value of the `weight` attribute, if the element has one.
    weight: Option<String>,
    /// Value of the `posture` attribute, if the element has one.
    posture: Option<String>,
    /// Value of the `genericFamily` attribute, if the element has one.
    generic_family: Option<String>,
}
1896
1897fn collect_template_font_entries(template_xml: &str) -> Vec<TemplateFontEntry> {
1898    let mut entries = Vec::new();
1899    let mut seen = std::collections::HashSet::new();
1900    if let Ok(xml_doc) = roxmltree::Document::parse(template_xml) {
1901        for node in xml_doc.descendants() {
1902            if node.tag_name().name() == "font" {
1903                if let Some(typeface) = node.attribute("typeface") {
1904                    let name = typeface.to_string();
1905                    let weight = node.attribute("weight").map(|s| s.to_string());
1906                    let posture = node.attribute("posture").map(|s| s.to_string());
1907                    let generic_family = node.attribute("genericFamily").map(|s| s.to_string());
1908                    let key = font_variant_key(&name, weight.as_deref(), posture.as_deref());
1909                    if !name.is_empty() && seen.insert(key.to_lowercase()) {
1910                        entries.push(TemplateFontEntry {
1911                            typeface: name,
1912                            weight,
1913                            posture,
1914                            generic_family,
1915                        });
1916                    }
1917                }
1918            }
1919        }
1920    }
1921    entries
1922}
1923
1924fn embed_font_in_pdf(doc: &mut Document, font: &ResolvedFont) -> ObjectId {
1925    let font_stream = Stream::new(
1926        dictionary! {
1927            "Length" => Object::Integer(font.data.len() as i64),
1928            "Length1" => Object::Integer(font.data.len() as i64)
1929        },
1930        font.data.clone(),
1931    );
1932    let font_file_id = doc.add_object(Object::Stream(font_stream));
1933
1934    let upem = font.units_per_em as f64;
1935    let scale = 1000.0 / upem.max(1.0);
1936    let ascent = (font.ascender as f64 * scale) as i64;
1937    let descent = (font.descender as f64 * scale) as i64;
1938    let cap_height = (ascent as f64 * 0.7) as i64;
1939    let base_name = font.name.replace(' ', "-");
1940
1941    let fd = dictionary! {
1942        "Type" => Object::Name(b"FontDescriptor".to_vec()),
1943        "FontName" => Object::Name(base_name.as_bytes().to_vec()),
1944        "Flags" => Object::Integer(32),
1945        "FontBBox" => Object::Array(vec![
1946            Object::Integer(0),
1947            Object::Integer(descent),
1948            Object::Integer(1000),
1949            Object::Integer(ascent),
1950        ]),
1951        "ItalicAngle" => Object::Integer(0),
1952        "Ascent" => Object::Integer(ascent),
1953        "Descent" => Object::Integer(descent),
1954        "CapHeight" => Object::Integer(cap_height),
1955        "StemV" => Object::Integer(80),
1956        "FontFile2" => Object::Reference(font_file_id)
1957    };
1958    let fd_id = doc.add_object(Object::Dictionary(fd));
1959
1960    // Build CID font data for Identity-H encoding.
1961    let cid_info = font.cid_font_info().unwrap_or(CidFontInfo {
1962        widths: vec![500],
1963        gid_to_unicode: vec![],
1964    });
1965
1966    // /W array: [ 0 [w0 w1 w2 ... wN] ]
1967    let widths_inner: Vec<Object> = cid_info
1968        .widths
1969        .iter()
1970        .map(|&w| Object::Integer(w as i64))
1971        .collect();
1972    let w_array = vec![Object::Integer(0), Object::Array(widths_inner)];
1973
1974    let cid_font = dictionary! {
1975        "Type" => Object::Name(b"Font".to_vec()),
1976        "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
1977        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
1978        "CIDSystemInfo" => Object::Dictionary(dictionary! {
1979            "Registry" => Object::String(b"Adobe".to_vec(), StringFormat::Literal),
1980            "Ordering" => Object::String(b"Identity".to_vec(), StringFormat::Literal),
1981            "Supplement" => Object::Integer(0)
1982        }),
1983        "FontDescriptor" => Object::Reference(fd_id),
1984        "W" => Object::Array(w_array),
1985        "CIDToGIDMap" => Object::Name(b"Identity".to_vec())
1986    };
1987    let cid_font_id = doc.add_object(Object::Dictionary(cid_font));
1988
1989    // ToUnicode CMap for text extraction / copy-paste.
1990    let tounicode_data = generate_tounicode_cmap(&cid_info.gid_to_unicode);
1991    let tounicode_stream = Stream::new(
1992        dictionary! { "Length" => Object::Integer(tounicode_data.len() as i64) },
1993        tounicode_data,
1994    );
1995    let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
1996
1997    // Type0 (composite) font with Identity-H encoding.
1998    let type0_font = dictionary! {
1999        "Type" => Object::Name(b"Font".to_vec()),
2000        "Subtype" => Object::Name(b"Type0".to_vec()),
2001        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
2002        "Encoding" => Object::Name(b"Identity-H".to_vec()),
2003        "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
2004        "ToUnicode" => Object::Reference(tounicode_id)
2005    };
2006    doc.add_object(Object::Dictionary(type0_font))
2007}
2008
/// Generate a ToUnicode CMap stream mapping glyph IDs to Unicode codepoints.
///
/// Each `(gid, ch)` pair becomes one `bfchar` line whose source code is the
/// glyph id as four hex digits and whose destination is `ch` encoded as
/// UTF-16BE. Characters above U+FFFF are therefore written as a surrogate
/// pair (eight hex digits) rather than as a raw 5–6 digit scalar value.
/// Entries are emitted in sections of at most 100 mappings; an empty input
/// yields a CMap with no `bfchar` section.
fn generate_tounicode_cmap(gid_to_unicode: &[(u16, char)]) -> Vec<u8> {
    let mut cmap = String::with_capacity(gid_to_unicode.len() * 24 + 256);
    cmap.push_str("/CIDInit /ProcSet findresource begin\n");
    cmap.push_str("12 dict begin\n");
    cmap.push_str("begincmap\n");
    cmap.push_str("/CIDSystemInfo\n");
    cmap.push_str("<< /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n");
    cmap.push_str("/CMapName /Adobe-Identity-UCS def\n");
    cmap.push_str("/CMapType 2 def\n");
    cmap.push_str("1 begincodespacerange\n");
    cmap.push_str("<0000> <FFFF>\n");
    cmap.push_str("endcodespacerange\n");
    for chunk in gid_to_unicode.chunks(100) {
        // Writing into a `String` cannot fail, hence the ignored results.
        let _ = writeln!(cmap, "{} beginbfchar", chunk.len());
        for &(gid, ch) in chunk {
            let _ = write!(cmap, "<{:04X}> <", gid);
            let mut units = [0u16; 2];
            for unit in ch.encode_utf16(&mut units).iter() {
                let _ = write!(cmap, "{:04X}", unit);
            }
            cmap.push_str(">\n");
        }
        cmap.push_str("endbfchar\n");
    }
    cmap.push_str("endcmap\n");
    cmap.push_str("CMapName currentdict /CMap defineresource pop\n");
    cmap.push_str("end\nend\n");
    cmap.into_bytes()
}
2034
2035/// Resolve all fonts referenced in the XFA template without embedding them.
2036///
2037/// Returns a map from variant key to `ResolvedFont`. The key encodes typeface,
2038/// weight, and posture so that "Arial bold" and "Arial regular" are resolved
2039/// separately. Called BEFORE layout so that resolved metrics can be injected
2040/// into the `FormTree`.
2041fn resolve_template_fonts(template_xml: &str, pdf_bytes: &[u8]) -> HashMap<String, ResolvedFont> {
2042    let mut resolved = HashMap::new();
2043    let entries = collect_template_font_entries(template_xml);
2044    if entries.is_empty() {
2045        return resolved;
2046    }
2047    let source_doc = match Document::load_mem(pdf_bytes) {
2048        Ok(d) => d,
2049        Err(_) => return resolved,
2050    };
2051    let embedded_fonts = extract_embedded_fonts(&source_doc);
2052    let mut resolver = XfaFontResolver::new(embedded_fonts);
2053    for entry in &entries {
2054        let spec = XfaFontSpec::from_xfa_attrs(
2055            &entry.typeface,
2056            entry.weight.as_deref(),
2057            entry.posture.as_deref(),
2058            None,
2059            entry.generic_family.as_deref(),
2060        );
2061        let key = font_variant_key(
2062            &entry.typeface,
2063            entry.weight.as_deref(),
2064            entry.posture.as_deref(),
2065        );
2066        match resolver.resolve(&spec) {
2067            Ok(font) => {
2068                resolved.insert(key, font);
2069            }
2070            Err(e) => {
2071                eprintln!("Font resolution failed for '{}': {}", entry.typeface, e);
2072            }
2073        }
2074    }
2075    resolved
2076}
2077
2078/// Inject resolved font metrics into the FormTree before layout.
2079///
2080/// For each node whose style metadata carries a `font_family`, looks up the
2081/// matching `ResolvedFont` (using the variant key that includes weight/posture)
2082/// and populates the `resolved_widths`, `resolved_upem`, `resolved_ascender`,
2083/// and `resolved_descender` fields on the node's `FontMetrics`.
2084/// This makes `measure_width()` and `line_height_pt()` in the layout engine use
2085/// actual font data instead of generic AFM tables.
2086fn inject_resolved_metrics(
2087    tree: &mut xfa_layout_engine::form::FormTree,
2088    resolved: &HashMap<String, ResolvedFont>,
2089) {
2090    for i in 0..tree.nodes.len() {
2091        let id = xfa_layout_engine::form::FormNodeId(i);
2092        let style = &tree.meta(id).style;
2093        let font_family = style.font_family.clone();
2094        let font_weight = style.font_weight.clone();
2095        let font_style = style.font_style.clone();
2096        if let Some(ref family) = font_family {
2097            // Try variant-specific key first, then fall back to base key.
2098            let variant_key =
2099                font_variant_key(family, font_weight.as_deref(), font_style.as_deref());
2100            let base_key = font_variant_key(family, None, None);
2101            let font = resolved
2102                .get(&variant_key)
2103                .or_else(|| resolved.get(&base_key));
2104            if let Some(font) = font {
2105                let (_first_char, widths) = font.pdf_glyph_widths();
2106                let node = tree.get_mut(id);
2107                node.font.resolved_widths = Some(widths);
2108                node.font.resolved_upem = Some(font.units_per_em);
2109                node.font.resolved_ascender = Some(font.ascender);
2110                node.font.resolved_descender = Some(font.descender);
2111            }
2112        }
2113    }
2114}
2115
2116/// Embed already-resolved fonts into the PDF document.
2117///
2118/// Called AFTER layout. Returns the font_map (typeface -> PDF resource name),
2119/// the font objects for page resources, and the metrics data for render_bridge.
2120fn simple_encoding_unicode_to_code_map(encoding: &PdfSimpleEncoding) -> HashMap<u16, u8> {
2121    let mut map = HashMap::new();
2122    for (code, unicode) in encoding.code_to_unicode_table().into_iter().enumerate() {
2123        if let Some(cp) = unicode {
2124            map.entry(cp).or_insert(code as u8);
2125        }
2126    }
2127    map
2128}
2129
2130fn add_text_chars_for_font(
2131    chars_by_font: &mut HashMap<String, HashSet<char>>,
2132    font_family: Option<&str>,
2133    font_weight: Option<&str>,
2134    font_style: Option<&str>,
2135    text: &str,
2136) {
2137    let Some(family) = font_family else {
2138        return;
2139    };
2140    if text.is_empty() {
2141        return;
2142    }
2143    let chars: Vec<char> = text.chars().filter(|c| !c.is_control()).collect();
2144    if chars.is_empty() {
2145        return;
2146    }
2147
2148    let variant = font_variant_key(family, font_weight, font_style);
2149    chars_by_font
2150        .entry(variant)
2151        .or_default()
2152        .extend(chars.iter().copied());
2153    chars_by_font
2154        .entry(family.to_string())
2155        .or_default()
2156        .extend(chars);
2157}
2158
2159fn add_text_chars_for_style(
2160    chars_by_font: &mut HashMap<String, HashSet<char>>,
2161    style: &FormNodeStyle,
2162    text: &str,
2163) {
2164    add_text_chars_for_font(
2165        chars_by_font,
2166        style.font_family.as_deref(),
2167        style.font_weight.as_deref(),
2168        style.font_style.as_deref(),
2169        text,
2170    );
2171}
2172
2173fn collect_used_chars_from_layout_node(
2174    node: &LayoutNode,
2175    chars_by_font: &mut HashMap<String, HashSet<char>>,
2176) {
2177    match &node.content {
2178        LayoutContent::Text(t) => add_text_chars_for_style(chars_by_font, &node.style, t),
2179        LayoutContent::Field { value, .. } => {
2180            add_text_chars_for_style(chars_by_font, &node.style, value)
2181        }
2182        LayoutContent::WrappedText { lines, .. } => {
2183            for line in lines {
2184                add_text_chars_for_style(chars_by_font, &node.style, line);
2185            }
2186        }
2187        LayoutContent::Draw(DrawContent::Text(t)) => {
2188            add_text_chars_for_style(chars_by_font, &node.style, t)
2189        }
2190        _ => {}
2191    }
2192
2193    if let Some(caption) = &node.style.caption_text {
2194        add_text_chars_for_style(chars_by_font, &node.style, caption);
2195    }
2196
2197    if let Some(spans) = &node.style.rich_text_spans {
2198        for span in spans {
2199            add_text_chars_for_font(
2200                chars_by_font,
2201                span.font_family
2202                    .as_deref()
2203                    .or(node.style.font_family.as_deref()),
2204                span.font_weight
2205                    .as_deref()
2206                    .or(node.style.font_weight.as_deref()),
2207                span.font_style
2208                    .as_deref()
2209                    .or(node.style.font_style.as_deref()),
2210                &span.text,
2211            );
2212        }
2213    }
2214
2215    for child in &node.children {
2216        collect_used_chars_from_layout_node(child, chars_by_font);
2217    }
2218}
2219
2220fn collect_used_chars_by_font(layout: &LayoutDom) -> HashMap<String, HashSet<char>> {
2221    let mut chars_by_font = HashMap::new();
2222    for page in &layout.pages {
2223        for node in &page.nodes {
2224            collect_used_chars_from_layout_node(node, &mut chars_by_font);
2225        }
2226    }
2227    chars_by_font
2228}
2229
2230fn simple_font_can_encode_char(font: &ResolvedFont, ch: char) -> bool {
2231    if ch.is_ascii() {
2232        return true;
2233    }
2234    if let Some(encoding) = &font.pdf_encoding {
2235        let Ok(cp) = u16::try_from(ch as u32) else {
2236            return false;
2237        };
2238        return encoding
2239            .code_to_unicode_table()
2240            .into_iter()
2241            .flatten()
2242            .any(|u| u == cp);
2243    }
2244    unicode_to_winansi(ch).is_some()
2245}
2246
/// Strip a trailing `_<weight>_<style>` variant suffix from a font key.
///
/// Returns the part of `key` before the suffix, or `None` when `key` does not
/// end in one of the four recognised weight/style suffixes.
fn variant_key_base_name(key: &str) -> Option<&str> {
    const VARIANT_SUFFIXES: [&str; 4] = [
        "_Bold_Italic",
        "_Bold_Normal",
        "_Normal_Italic",
        "_Normal_Normal",
    ];

    VARIANT_SUFFIXES
        .iter()
        .find_map(|suffix| key.strip_suffix(*suffix))
}
2253
2254#[allow(clippy::type_complexity)]
2255fn embed_resolved_fonts(
2256    doc: &mut Document,
2257    resolved: &HashMap<String, ResolvedFont>,
2258    layout: &LayoutDom,
2259) -> (
2260    HashMap<String, String>,
2261    Vec<(String, ObjectId)>,
2262    HashMap<String, FontMetricsData>,
2263) {
2264    let mut font_map = HashMap::new();
2265    let mut font_objects = Vec::new();
2266    let mut metrics_data = HashMap::new();
2267    let used_chars_by_font = collect_used_chars_by_font(layout);
2268    for (idx, (name, font)) in resolved.iter().enumerate() {
2269        let resource_name = format!("XFA_F{}", idx);
2270        // fix(#811): once a simple source font survives resolution, keep using
2271        // the original PDF object instead of emitting a synthetic Type0/system
2272        // fallback. That keeps field-fit behaviour aligned with the source PDF
2273        // and Acrobat's interpretation of the same /Widths table.
2274        //
2275        // WHY: custom encodings and non-ASCII content can require Unicode
2276        // shaping through Identity-H. If a simple source font cannot encode
2277        // the actual text in layout output, reusing it would produce '?'
2278        // substitutions in content streams.
2279        //
2280        // LIMITATION: CID source fonts (/Type0 with /W arrays) use a different
2281        // mechanism and are not covered by this simple-font encodeability gate.
2282        let used_chars = used_chars_by_font
2283            .get(name)
2284            .or_else(|| used_chars_by_font.get(&font.name))
2285            .or_else(|| variant_key_base_name(name).and_then(|base| used_chars_by_font.get(base)));
2286        let source_can_encode_all_text = used_chars.is_none_or(|chars| {
2287            chars
2288                .iter()
2289                .all(|ch| simple_font_can_encode_char(font, *ch))
2290        });
2291        let (obj_id, render_font_data) = if let Some(source_font) = font.pdf_source_font {
2292            if source_can_encode_all_text || font.data.is_empty() {
2293                (source_font.object_id, None)
2294            } else {
2295                (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2296            }
2297        } else {
2298            (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2299        };
2300        font_map.insert(name.clone(), format!("/{}", resource_name));
2301        font_objects.push((resource_name, obj_id));
2302        let (_first_char, widths) = font.pdf_glyph_widths();
2303        metrics_data.insert(
2304            name.clone(),
2305            FontMetricsData {
2306                widths,
2307                upem: font.units_per_em,
2308                ascender: font.ascender,
2309                descender: font.descender,
2310                font_data: render_font_data,
2311                face_index: font.face_index,
2312                simple_unicode_to_code: font
2313                    .pdf_encoding
2314                    .as_ref()
2315                    .map(simple_encoding_unicode_to_code_map),
2316            },
2317        );
2318    }
2319    (font_map, font_objects, metrics_data)
2320}
2321
2322/// Fallback: preserve existing page content, strip AcroForm/widgets only.
2323/// If lopdf can't parse the PDF (corrupt xref), return the original bytes
2324/// unchanged — the PDF is too corrupt for us to modify but still renderable.
2325///
2326/// This function ALWAYS returns Ok — errors are logged but the original bytes
2327/// are always returned as a last resort.
2328fn static_fallback(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
2329    let mut doc = match Document::load_mem(pdf_bytes) {
2330        Ok(d) => d,
2331        Err(e) => {
2332            eprintln!("static_fallback: lopdf load failed ({e}), returning original bytes");
2333            return Ok(pdf_bytes.to_vec());
2334        }
2335    };
2336    strip_widgets_and_acroform(&mut doc);
2337    javascript_policy::strip_javascript_for_flatten(&mut doc);
2338    let mut out = Vec::new();
2339    if let Err(e) = doc.save_to(&mut out) {
2340        eprintln!("static_fallback: save failed ({e}), returning original bytes");
2341        return Ok(pdf_bytes.to_vec());
2342    }
2343    Ok(out)
2344}
2345
/// Number of page-area slots recorded in the pre-saved form DOM.
///
/// A form DOM saved by Adobe Reader after script execution carries one
/// `<pageArea` element per physical page, so the number of occurrences of
/// that opening tag is used as the rendered page count (and as a cap on how
/// many pages may be suppressed).
///
/// Returns `None` when `form_xml` contains no `<pageArea` occurrence at all.
fn form_dom_page_count(form_xml: &str) -> Option<usize> {
    match form_xml.matches("<pageArea").count() {
        0 => None,
        pages => Some(pages),
    }
}
2362
/// Apply presence overrides, field values and repeating-instance expansion
/// from the XFA form DOM packet.
///
/// When an XFA PDF has been opened and saved by Adobe Reader, the form DOM
/// captures the runtime state of all nodes after scripts executed.  We walk
/// the form DOM and the FormTree in parallel (matching by subform/field name)
/// to:
///
/// 1. Transfer `presence="hidden"` attributes that our script interpreter
///    could not compute (e.g. Avoka framework's `sfcUtils.updateVisibility`).
/// 2. Replicate repeating subform instances (XFA §4.4.3): when `bind
///    match="none"` prevents data-driven expansion, the form DOM records the
///    correct instance count produced by the runtime's `instanceManager`.  We
///    deep-clone the template instance and populate field values from the form
///    DOM so the layout engine produces the right number of pages.
/// 3. Hide named template subforms that have no counterpart in a form-DOM
///    parent which does list subform children.
///
/// The function is a no-op when `form_xml` does not parse, when the form DOM
/// root has no `<subform>` child, or when no child of `root_id` carries the
/// same name as that root subform.
fn apply_form_dom_presence(tree: &mut FormTree, root_id: FormNodeId, form_xml: &str) {
    use xfa_layout_engine::form::{FormNodeType, Presence};

    // An unparseable form DOM is ignored rather than treated as an error.
    let Ok(doc) = roxmltree::Document::parse(form_xml) else {
        return;
    };

    /// Deep-clone a subtree rooted at `src_id`, returning the new root id.
    ///
    /// The clone is added to `tree` but is NOT attached to any parent; the
    /// caller is responsible for inserting the returned id into a children
    /// list. Every cloned node has its `xfa_id` cleared.
    fn clone_subtree(tree: &mut FormTree, src_id: FormNodeId) -> FormNodeId {
        let node = tree.get(src_id).clone();
        let meta = tree.meta(src_id).clone();
        // Temporarily take children out to avoid borrow issues
        let child_ids: Vec<FormNodeId> = node.children.clone();
        let mut new_node = node;
        new_node.children = Vec::new();
        // Clear xfa_id to avoid duplicate id conflicts
        let mut new_meta = meta;
        new_meta.xfa_id = None;
        let new_id = tree.add_node_with_meta(new_node, new_meta);
        // Recursively clone children
        for &child_id in &child_ids {
            let cloned_child = clone_subtree(tree, child_id);
            tree.get_mut(new_id).children.push(cloned_child);
        }
        new_id
    }

    /// Reset every `Field` leaf in a subtree to `value = ""`.
    ///
    /// Used immediately after `clone_subtree` produces a fresh
    /// form-DOM-driven clone. The form DOM is the canonical source
    /// for the runtime values of *its* declared instances (XFA §3.1).
    /// The cloned subtree initially carries the template instance's
    /// bound value (e.g. the lone `<xfa:datasets>` Row's value); if
    /// we leave it, the form-DOM apply path's `value.is_empty()`
    /// override guard silently no-ops on every cloned field. Clearing
    /// here lets the existing `apply_recursive` write per-clone values
    /// from the form-DOM `<value><text>...</text></value>` records.
    ///
    /// Only `Field` payloads are touched; `Draw` / `Subform` /
    /// `PageArea` / structural nodes are walked but unchanged. Names,
    /// box-models, presence, occur, and all other metadata are
    /// preserved from the clone.
    fn clear_field_values_in_subtree(tree: &mut FormTree, root_id: FormNodeId) {
        // Snapshot the child ids first so the tree can be mutated below.
        let child_ids: Vec<FormNodeId> = tree.get(root_id).children.clone();
        if let FormNodeType::Field { .. } = tree.get(root_id).node_type {
            tree.get_mut(root_id).node_type = FormNodeType::Field {
                value: String::new(),
            };
        }
        for cid in child_ids {
            clear_field_values_in_subtree(tree, cid);
        }
    }

    /// Extract the text content of the first `<value>` child's inner element
    /// (e.g. `<value><text>hello</text></value>` → `"hello"`).
    ///
    /// Returns `None` when there is no `<value>` child, when it has no element
    /// child, or when that element has no text.
    fn extract_field_value(xml_field: roxmltree::Node<'_, '_>) -> Option<String> {
        let value_el = xml_field
            .children()
            .find(|c| c.is_element() && c.tag_name().name() == "value")?;
        // The inner element may be <text>, <date>, <time>, <float>, etc.
        let inner = value_el.children().find(|c| c.is_element())?;
        inner.text().map(|t| t.to_string())
    }

    /// Apply presence, values, and child expansion from the form DOM to a
    /// FormTree node.
    ///
    /// `form_node_id` is the FormTree node that has already been paired with
    /// `xml_node`. XML elements other than `subform`, `field` and `form` are
    /// ignored.
    fn apply_recursive(
        tree: &mut FormTree,
        form_node_id: FormNodeId,
        xml_node: roxmltree::Node<'_, '_>,
    ) {
        let xml_tag = xml_node.tag_name().name();
        if xml_tag != "subform" && xml_tag != "field" && xml_tag != "form" {
            return;
        }

        // Apply presence override. Only `presence="hidden"` is transferred;
        // other presence values in the form DOM leave the node unchanged.
        if xml_tag == "subform" || xml_tag == "field" {
            if let Some(pres) = xml_node.attribute("presence") {
                if pres == "hidden" {
                    tree.meta_mut(form_node_id).presence = Presence::Hidden;
                    // UX1: trace the form-DOM-driven presence override. This
                    // is the production wiring of the presence emitter
                    // helper; the engine emits one event per applied
                    // override.
                    let som = tree.get(form_node_id).name.clone();
                    trace_sites::presence(
                        &som,
                        TraceReason::PresenceHidden,
                        "form_dom_presence_hidden",
                    );
                }
            }
        }

        // Transfer the field value from the form DOM, but only when the
        // FormTree field currently has an empty value; a non-empty value is
        // never overwritten here.
        if xml_tag == "field" {
            if let Some(val) = extract_field_value(xml_node) {
                if let FormNodeType::Field { ref value, .. } = tree.get(form_node_id).node_type {
                    if value.is_empty() {
                        tree.get_mut(form_node_id).node_type = FormNodeType::Field { value: val };
                    }
                }
            }
            return; // fields have no structural children to recurse into
        }

        // Collect XML children (subforms, fields and draws); every other
        // element, such as instanceManagers, is skipped.
        let xml_children: Vec<roxmltree::Node<'_, '_>> = xml_node
            .children()
            .filter(|c| {
                c.is_element()
                    && (c.tag_name().name() == "subform"
                        || c.tag_name().name() == "field"
                        || c.tag_name().name() == "draw")
            })
            .collect();

        // Group consecutive XML children by name to detect repeating instances.
        // E.g., [Activity, Activity, Activity, Activity] → ("Activity", 4)
        // Children without a `name` attribute are grouped under "".
        let mut xml_groups: Vec<(&str, Vec<roxmltree::Node<'_, '_>>)> = Vec::new();
        for &xc in &xml_children {
            let xname = xc.attribute("name").unwrap_or("");
            if let Some(last) = xml_groups.last_mut() {
                if last.0 == xname {
                    last.1.push(xc);
                    continue;
                }
            }
            xml_groups.push((xname, vec![xc]));
        }

        // For each group, match against FormTree children, cloning when needed.
        // `used[i]` marks whether `form_children[i]` has already been paired
        // with an XML node; the two vectors are kept index-aligned.
        let mut form_children = tree.get(form_node_id).children.clone();
        let mut used = vec![false; form_children.len()];

        for (gname, group_xml_nodes) in &xml_groups {
            let xml_count = group_xml_nodes.len();

            // Count existing FormTree children with this name
            let existing: Vec<(usize, FormNodeId)> = form_children
                .iter()
                .enumerate()
                .filter(|(i, &fid)| !used[*i] && tree.get(fid).name == *gname)
                .map(|(i, &fid)| (i, fid))
                .collect();
            let existing_count = existing.len();

            // M5.3d: form-DOM-driven replication is now the executable
            // `form_dom_driven_repeat_instance_replication` rule. The
            // rule decides how many clones to add (matches the prior
            // inline `if xml_count > existing_count && existing_count > 0
            // { … xml_count - existing_count }`) and emits the trace
            // anchor when it adds clones. Caller still owns the mutation.
            let replication = crate::adobe_compat::form_dom_driven_repeat_instance_replication(
                gname,
                xml_count,
                existing_count,
            );
            if replication.clones_to_add > 0 {
                // NOTE(review): indexing `existing[0]` and unwrapping
                // `existing.last()` assume the rule never requests clones
                // when `existing_count == 0` — confirm against the rule.
                let template_id = existing[0].1;
                // Find insertion position: after the last existing sibling
                let last_existing_idx = existing.last().unwrap().0;
                let insert_pos = last_existing_idx + 1;
                let clones_needed = replication.clones_to_add;
                // When the form DOM declares more instances of this
                // group than the template+datasets expansion produced,
                // the form DOM is the runtime source-of-truth for the
                // *entire* group (XFA §3.1). Clear field values on the
                // existing template instances too so the form-DOM apply
                // walk's `is_empty()` override guard fires for each
                // matched record. Without this the first existing
                // template instance keeps its `<xfa:datasets>`-bound
                // value while the new clones get form-DOM values,
                // producing a mixed-source output.
                for (_idx, fid) in &existing {
                    clear_field_values_in_subtree(tree, *fid);
                }
                let mut new_ids = Vec::new();
                for _ in 0..clones_needed {
                    let cloned = clone_subtree(tree, template_id);
                    // Same clearing rationale for the freshly-cloned
                    // subtree: the clone inherits the template's bound
                    // value, but it must take its value from the
                    // corresponding form-DOM record.
                    clear_field_values_in_subtree(tree, cloned);
                    new_ids.push(cloned);
                }
                // Insert cloned nodes into the parent's children list,
                // keeping `used` aligned with `form_children`.
                for (offset, new_id) in new_ids.iter().enumerate() {
                    form_children.insert(insert_pos + offset, *new_id);
                    used.insert(insert_pos + offset, false);
                }
                // Persist the updated children list
                tree.get_mut(form_node_id).children = form_children.clone();
                // The rule's executable function already emitted the
                // (occur, subform_materialised_from_data) trace anchor
                // with the final instance count. The prior UX1 inline
                // emit at this site is now redundant and has been
                // removed; the rule is the single emit point.
            }

            // Now match each XML node in the group to a FormTree child
            for (group_idx, &xc) in group_xml_nodes.iter().enumerate() {
                // Find next unmatched FormTree child with this name
                let matched = form_children
                    .iter()
                    .enumerate()
                    .skip(if group_idx > 0 {
                        // Start searching after the last matched position
                        form_children
                            .iter()
                            .enumerate()
                            .rfind(|(i, &fid)| used[*i] && tree.get(fid).name == *gname)
                            .map(|(i, _)| i + 1)
                            .unwrap_or(0)
                    } else {
                        0
                    })
                    .find(|(i, &fid)| !used[*i] && tree.get(fid).name == *gname);
                // XML nodes without a same-named, still-unused FormTree child
                // are silently skipped.
                if let Some((idx, &fid)) = matched {
                    used[idx] = true;
                    apply_recursive(tree, fid, xc);
                }
            }
        }

        // XFA §3.1: the form DOM represents the runtime-instantiated form.
        // Named template subforms NOT present in the form DOM were never
        // instantiated by Adobe's runtime (e.g. script-driven conditional
        // sections).  Hide them to prevent over-pagination from phantom
        // page-level subforms.
        //
        // Only suppress when the form DOM explicitly lists subform children;
        // a sparse form DOM with no structural children means it didn't
        // record child state and we should not infer absence.
        let has_subform_children = xml_children
            .iter()
            .any(|c| c.tag_name().name() == "subform");
        if has_subform_children {
            for (i, &fid) in form_children.iter().enumerate() {
                if used[i] {
                    continue;
                }
                let child_node = tree.get(fid);
                // Only suppress named subforms — skip pageSet, unnamed
                // transparent nodes, draws, fields, and structural elements.
                if matches!(child_node.node_type, FormNodeType::Subform)
                    && !child_node.name.is_empty()
                {
                    let som = child_node.name.clone();
                    tree.meta_mut(fid).presence = Presence::Hidden;
                    // UX1: trace the form-DOM-absence-driven presence
                    // suppression. The subform is named in the template but
                    // absent from the saved form DOM — the runtime hid it
                    // and we mirror that decision.
                    trace_sites::presence(
                        &som,
                        TraceReason::PresenceHidden,
                        "form_dom_absent_subform_hidden",
                    );
                }
            }
        }
    }

    // The form DOM root is <form><subform name="...">...</subform></form>
    let form_root = doc.root_element();
    let form_root_subform = form_root
        .children()
        .find(|c| c.is_element() && c.tag_name().name() == "subform");

    // Pair the form DOM root subform with the first same-named child of
    // `root_id` and walk from there; only that first match is processed.
    if let Some(xml_root_sf) = form_root_subform {
        let root_children = tree.get(root_id).children.clone();
        let root_name = xml_root_sf.attribute("name").unwrap_or("");
        for &child_id in &root_children {
            if tree.get(child_id).name == root_name {
                apply_recursive(tree, child_id, xml_root_sf);
                break;
            }
        }
    }
}
2664
2665/// Tiny PDFs (<1KB) with XFA templates that lack essential elements (subform,
2666/// pageSet) are corrupt stubs. Attempting to flatten these produces blank pages
2667/// instead of preserving the original page content.
2668fn is_corrupt_xfa_template(pdf_size: usize, template_xml: &str) -> bool {
2669    // Only apply to small PDFs — larger files may have legitimate sparse templates.
2670    if pdf_size >= 1024 {
2671        return false;
2672    }
2673    // A valid XFA template must parse and contain at least one subform or pageSet.
2674    match roxmltree::Document::parse(template_xml) {
2675        Ok(doc) => {
2676            let root = doc.root_element();
2677            !root.children().any(|c| {
2678                c.is_element()
2679                    && matches!(c.tag_name().name(), "subform" | "pageSet" | "subformSet")
2680            })
2681        }
2682        Err(_) => true, // Unparseable template is corrupt.
2683    }
2684}
2685
2686/// Strip undefined XML entity references from XFA template/datasets XML.
2687///
2688/// `roxmltree` only supports the five predefined XML entities (lt, gt, amp,
2689/// quot, apos). Some XFA PDFs contain custom entity references like `&xxe;`
2690/// that cause parse failures, so we drop only those references.
2691///
2692/// fixes #812: Adobe-generated XFA packets also contain raw `&` inside
2693/// processing instructions such as `<?renderCache.subset ... "#$%&'()+"?>`
2694/// and `<?renderCache.textRun ... "A. Adjustment & Location" ...?>`.
2695/// Those packets are valid XML because PI payload is opaque text. The old
2696/// implementation deleted everything between `&` and the next `;`, which
2697/// corrupted valid templates before merge and forced the flattener down the
2698/// 1-page static fallback path.
2699///
2700/// XFA Spec 3.3 §8.6 / §8.8 rely on the template reaching the merge/layout
2701/// pipeline intact. CID `/W` handling is unrelated and remains out of scope.
2702fn strip_undefined_xml_entities(xml: &str) -> String {
2703    let predefined = ["lt", "gt", "amp", "quot", "apos"];
2704    let mut result = String::with_capacity(xml.len());
2705    let bytes = xml.as_bytes();
2706    let mut pos = 0;
2707
2708    while let Some(rel_amp_pos) = xml[pos..].find('&') {
2709        let amp_pos = pos + rel_amp_pos;
2710        result.push_str(&xml[pos..amp_pos]);
2711
2712        if let Some((entity_name, next_pos)) = parse_xml_entity_reference(xml, amp_pos) {
2713            // Keep numeric character references (&#123; or &#x1F;) and the
2714            // predefined XML entities. Drop only true named entity references
2715            // that roxmltree cannot resolve.
2716            if entity_name.starts_with('#') || predefined.contains(&entity_name) {
2717                result.push_str(&xml[amp_pos..next_pos]);
2718            }
2719            pos = next_pos;
2720        } else {
2721            // Not an XML entity reference; preserve the raw ampersand.
2722            result.push('&');
2723            pos = amp_pos + 1;
2724        }
2725    }
2726
2727    if pos < bytes.len() {
2728        result.push_str(&xml[pos..]);
2729    }
2730    result
2731}
2732
2733fn parse_xml_entity_reference(xml: &str, amp_pos: usize) -> Option<(&str, usize)> {
2734    let bytes = xml.as_bytes();
2735    let start = amp_pos + 1;
2736    let first = *bytes.get(start)?;
2737
2738    // Numeric character references: &#123; or &#x1F;
2739    if first == b'#' {
2740        let mut idx = start + 1;
2741        if matches!(bytes.get(idx), Some(b'x' | b'X')) {
2742            idx += 1;
2743            let hex_start = idx;
2744            while matches!(
2745                bytes.get(idx),
2746                Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
2747            ) {
2748                idx += 1;
2749            }
2750            if idx == hex_start || !matches!(bytes.get(idx), Some(b';')) {
2751                return None;
2752            }
2753        } else {
2754            let digits_start = idx;
2755            while matches!(bytes.get(idx), Some(b'0'..=b'9')) {
2756                idx += 1;
2757            }
2758            if idx == digits_start || !matches!(bytes.get(idx), Some(b';')) {
2759                return None;
2760            }
2761        }
2762        return Some((&xml[start..idx], idx + 1));
2763    }
2764
2765    // Named references: &name; where `name` follows XML Name syntax enough to
2766    // distinguish it from raw PI/script/text ampersands.
2767    if !is_xml_name_start(first) {
2768        return None;
2769    }
2770
2771    let mut idx = start + 1;
2772    while let Some(&b) = bytes.get(idx) {
2773        if b == b';' {
2774            return Some((&xml[start..idx], idx + 1));
2775        }
2776        if !is_xml_name_char(b) {
2777            return None;
2778        }
2779        idx += 1;
2780    }
2781    None
2782}
2783
/// `true` when `byte` may begin an entity name: an ASCII letter, `:` or `_`.
fn is_xml_name_start(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b':' || byte == b'_'
}
2787
2788fn is_xml_name_char(byte: u8) -> bool {
2789    is_xml_name_start(byte) || matches!(byte, b'-' | b'.' | b'0'..=b'9')
2790}
2791
2792// ---------------------------------------------------------------------------
2793// Helpers
2794// ---------------------------------------------------------------------------
2795
2796/// Returns `true` when the PDF's pages already carry substantial static content.
2797///
2798/// An array /Contents entry (multiple streams) or any individual stream larger
2799/// than 200 bytes indicates pre-flattened page content that should be preserved
2800/// rather than replaced by XFA re-rendering. Adobe's default XFA fallback
2801/// page ("Please wait..." / Adobe Reader upgrade text) is explicitly ignored:
2802/// those bytes are not real pre-rendered form content and must not suppress
2803/// XFA flattening.
2804fn pages_have_static_content(doc: &Document) -> bool {
2805    for page_id in doc.page_iter() {
2806        let streams = page_content_streams(doc, page_id);
2807        if streams.is_empty() {
2808            continue;
2809        }
2810
2811        // Count text-drawing operators (Tj/TJ) across all non-placeholder
2812        // content streams for this page. A real pre-rendered form page has
2813        // dozens of text operators; a watermark or evaluation overlay has
2814        // only 1–3. We require ≥5 non-placeholder text operators to
2815        // consider the page as having substantial static content.
2816        let mut text_op_count = 0usize;
2817        for stream in &streams {
2818            if is_xfa_placeholder_stream(stream) || is_watermark_stream(stream) {
2819                continue;
2820            }
2821            text_op_count += count_text_operators(stream);
2822        }
2823
2824        if text_op_count >= 5 {
2825            return true;
2826        }
2827    }
2828    false
2829}
2830
2831fn page_content_streams(doc: &Document, page_id: ObjectId) -> Vec<Vec<u8>> {
2832    let Ok(page_dict) = doc.get_dictionary(page_id) else {
2833        return Vec::new();
2834    };
2835
2836    match page_dict.get(b"Contents") {
2837        Ok(Object::Array(arr)) => arr
2838            .iter()
2839            .filter_map(|object| resolve_stream_content(doc, object))
2840            .collect(),
2841        Ok(Object::Reference(id)) => match doc.get_object(*id) {
2842            Ok(Object::Array(arr)) => arr
2843                .iter()
2844                .filter_map(|object| resolve_stream_content(doc, object))
2845                .collect(),
2846            Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
2847            Err(_) => Vec::new(),
2848        },
2849        Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
2850        Err(_) => Vec::new(),
2851    }
2852}
2853
2854fn resolve_stream_content(doc: &Document, object: &Object) -> Option<Vec<u8>> {
2855    let stream = match object {
2856        Object::Reference(id) => doc.get_object(*id).ok()?.as_stream().ok()?,
2857        Object::Stream(stream) => stream,
2858        _ => return None,
2859    };
2860
2861    stream
2862        .get_plain_content()
2863        .ok()
2864        .or_else(|| Some(stream.content.clone()))
2865}
2866
/// Count text-drawing operators (Tj / TJ) in a content stream.
///
/// An occurrence is counted only when `Tj` or `TJ` is immediately preceded by
/// a space, `)` or `]`.
fn count_text_operators(stream: &[u8]) -> usize {
    stream
        .windows(3)
        .filter(|window| matches!(*window, [b' ' | b')' | b']', b'T', b'j' | b'J']))
        .count()
}
2880
/// Bake checkbox/radio button appearance marks from AcroForm widget AP streams
/// onto existing page content for dynamic XFA forms.
///
/// Hybrid XFA PDFs carry pre-rendered appearance streams in their widget `/AP/N`
/// dictionaries. For radio/checkbox widgets the Normal appearance dict often has
/// only the "on" state (filled circle / checkmark) with no "Off" entry. Only
/// widgets that are currently asserted should contribute that mark to the
/// flattened page; widgets explicitly in the `Off` state must not be stamped
/// with the on-mark just because `/AP/N` lacks an `Off` appearance.
///
/// Returns the number of marks that were added to the page. The page's
/// `/Annots` array is not modified by this function.
fn bake_checkbox_radio_ap_marks(doc: &mut Document, page_id: ObjectId) -> usize {
    let annots = page_annotations(doc, page_id);
    if annots.is_empty() {
        return 0;
    }

    let mut baked = 0usize;
    // Drawing operators for all marks; appended to the page in one go below.
    let mut overlay_ops = Vec::new();

    for annot in &annots {
        // Only indirectly referenced annotations are considered.
        let Some(annot_id) = annot.as_reference().ok() else {
            continue;
        };
        // Work on a copy so `doc` can be mutated further down.
        let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
            continue;
        };

        let is_widget = annot_dict
            .get(b"Subtype")
            .ok()
            .and_then(|obj| obj.as_name().ok())
            == Some(&b"Widget"[..]);
        if !is_widget {
            continue;
        }

        // Radio/checkbox widgets have a dictionary of named states in /AP/N
        // (e.g. /N << /0 35 0 R >>).  Text fields and pushbuttons have /AP/N
        // as a single stream reference.  Use this to filter.
        let ap = match annot_dict.get(b"AP").ok().and_then(|o| o.as_dict().ok()) {
            Some(ap) => ap.clone(),
            None => continue,
        };
        let normal_obj = match ap.get(b"N").ok() {
            Some(obj) => obj.clone(),
            None => continue,
        };

        // Resolve /N to a dictionary of appearance states.
        let states: Dictionary = match &normal_obj {
            Object::Reference(id) => match doc.get_object(*id).ok().cloned() {
                Some(Object::Dictionary(d)) => d,
                _ => continue, // direct stream → not radio/checkbox
            },
            Object::Dictionary(d) => d.clone(),
            _ => continue,
        };

        // A widget whose selected state is `Off` contributes nothing.
        if matches!(selected_widget_state(&annot_dict), Some(state) if state == b"Off") {
            continue;
        }

        // Find the first non-"Off" state (the "on" mark appearance).
        // Only states stored as indirect references are eligible.
        let on_id = states
            .iter()
            .filter(|(name, _)| name.as_slice() != b"Off")
            .find_map(|(_, obj)| match obj {
                Object::Reference(id) => Some(*id),
                _ => None,
            });
        let Some(ap_id) = on_id else { continue };

        // Verify the referenced object is a Form XObject stream.
        // (Only the object kind is checked here, not its /Subtype.)
        match doc.get_object(ap_id).ok() {
            Some(Object::Stream(_)) => {}
            _ => continue,
        }

        let Some(rect) = annotation_rect(&annot_dict) else {
            continue;
        };

        // The resource name is derived from a counter that starts at 0 on
        // every call.
        // NOTE(review): if several pages share one indirect /Resources
        // dictionary, later calls would overwrite earlier `XfaCbApN` entries —
        // confirm whether that can happen for the inputs this path handles.
        let xobject_name = format!("XfaCbAp{}", baked);
        add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
        // Draw the appearance translated to the first two /Rect values; no
        // scaling is applied.
        write_ops(
            &mut overlay_ops,
            format_args!(
                "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
                rect[0], rect[1], xobject_name
            ),
        );
        baked += 1;
    }

    if !overlay_ops.is_empty() {
        append_to_page_content(doc, page_id, &overlay_ops);
    }

    baked
}
2980
2981fn is_xfa_placeholder_stream(stream: &[u8]) -> bool {
2982    const PLACEHOLDER_MARKERS: [&[u8]; 5] = [
2983        b"Please wait",
2984        b"Adobe Reader",
2985        b"reader_download",
2986        b"display this type of document",
2987        b"To view the full contents",
2988    ];
2989
2990    PLACEHOLDER_MARKERS
2991        .iter()
2992        .any(|marker| contains_ascii_case_insensitive(stream, marker))
2993}
2994
2995/// Detect evaluation-software watermark overlays (e.g. "Qoppa Software",
2996/// "For Evaluation Only"). These are short streams with ≤3 Tj operators
2997/// that should not count as real pre-rendered form content.
2998fn is_watermark_stream(stream: &[u8]) -> bool {
2999    const WATERMARK_MARKERS: [&[u8]; 3] =
3000        [b"Evaluation Only", b"Qoppa Software", b"For Evaluation"];
3001    WATERMARK_MARKERS
3002        .iter()
3003        .any(|marker| contains_ascii_case_insensitive(stream, marker))
3004}
3005
/// Report whether `needle` occurs anywhere in `haystack`, ignoring ASCII case.
///
/// An empty `needle` is considered to be contained in every haystack
/// (including an empty one), mirroring `str::contains("")`. Without that
/// guard the search would call `windows(0)`, which panics.
fn contains_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }

    haystack
        .windows(needle.len())
        .any(|window| window.eq_ignore_ascii_case(needle))
}
3011
/// Append formatted content-stream operators to `buf`.
///
/// `args` is normally produced with `format_args!`. The text is formatted
/// straight into the byte buffer, so no intermediate `String` is allocated.
/// A formatting error is ignored, as this is a best-effort helper.
fn write_ops(buf: &mut Vec<u8>, args: std::fmt::Arguments<'_>) {
    use std::io::Write as _;

    let _ = buf.write_fmt(args);
}
3019
/// Flatten Widget annotation appearances onto their pages.
///
/// Hybrid XFA PDFs often already contain the correct visual representation in
/// widget `/AP` streams. Stripping those widgets outright drops borders, text,
/// checkboxes, and image buttons. This helper bakes the normal appearance onto
/// the page content and removes only the widgets that were successfully
/// flattened. Returns the number of widgets flattened.
///
/// Pages on which nothing could be flattened are left untouched, including
/// their `/Annots` entry.
fn flatten_widget_appearances(doc: &mut Document) -> usize {
    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
    // Running total across all pages; also used to number the XObject names,
    // so those names are unique across the whole document.
    let mut flattened = 0usize;

    for page_id in page_ids {
        let annots = page_annotations(doc, page_id);
        if annots.is_empty() {
            continue;
        }

        // Annotations that stay on the page, in their original order.
        let mut retained = Vec::new();
        let mut overlay_ops = Vec::new();

        for annot in annots {
            // Inline (non-reference) annotations are kept as they are.
            let Some(annot_id) = annot.as_reference().ok() else {
                retained.push(annot);
                continue;
            };

            // Work on a copy so `doc` can be mutated further down.
            let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
                retained.push(annot);
                continue;
            };

            let is_widget = annot_dict
                .get(b"Subtype")
                .ok()
                .and_then(|obj| obj.as_name().ok())
                == Some(&b"Widget"[..]);
            if !is_widget {
                retained.push(annot);
                continue;
            }

            // A widget without a usable /Rect or normal appearance cannot be
            // drawn, so it is kept rather than silently dropped.
            let Some(rect) = annotation_rect(&annot_dict) else {
                retained.push(Object::Reference(annot_id));
                continue;
            };
            let Some(ap_id) = resolve_widget_normal_appearance(doc, &annot_dict) else {
                retained.push(Object::Reference(annot_id));
                continue;
            };

            let xobject_name = format!("XfaAp{}", flattened);
            add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
            // Draw the appearance translated to the first two /Rect values; no
            // scaling is applied.
            write_ops(
                &mut overlay_ops,
                format_args!(
                    "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
                    rect[0], rect[1], xobject_name
                ),
            );
            flattened += 1;
        }

        // Nothing was flattened on this page: leave content and /Annots alone.
        if overlay_ops.is_empty() {
            continue;
        }

        append_to_page_content(doc, page_id, &overlay_ops);
        set_page_annotations(doc, page_id, retained);
    }

    flattened
}
3092
3093/// Remove Widget annotations from a page, keeping non-Widget annotations.
3094fn strip_widget_annotations(doc: &mut Document, page_id: ObjectId) {
3095    let annots = page_annotations(doc, page_id);
3096    if annots.is_empty() {
3097        return;
3098    }
3099    let mut retained = Vec::new();
3100    for annot in &annots {
3101        let is_widget = annot
3102            .as_reference()
3103            .ok()
3104            .and_then(|id| doc.get_dictionary(id).ok())
3105            .and_then(|d| d.get(b"Subtype").ok())
3106            .and_then(|obj| obj.as_name().ok())
3107            == Some(&b"Widget"[..]);
3108        if !is_widget {
3109            retained.push(annot.clone());
3110        }
3111    }
3112    set_page_annotations(doc, page_id, retained);
3113}
3114
3115fn page_annotations(doc: &Document, page_id: ObjectId) -> Vec<Object> {
3116    let Ok(page_dict) = doc.get_dictionary(page_id) else {
3117        return Vec::new();
3118    };
3119
3120    match page_dict.get(b"Annots") {
3121        Ok(Object::Array(arr)) => arr.clone(),
3122        Ok(Object::Reference(id)) => doc
3123            .get_object(*id)
3124            .ok()
3125            .and_then(|obj| obj.as_array().ok().cloned())
3126            .unwrap_or_default(),
3127        _ => Vec::new(),
3128    }
3129}
3130
3131fn set_page_annotations(doc: &mut Document, page_id: ObjectId, annots: Vec<Object>) {
3132    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3133        if annots.is_empty() {
3134            page_dict.remove(b"Annots");
3135        } else {
3136            page_dict.set("Annots", Object::Array(annots));
3137        }
3138    }
3139}
3140
3141fn annotation_rect(dict: &Dictionary) -> Option<[f32; 4]> {
3142    let rect = dict.get(b"Rect").ok()?.as_array().ok()?;
3143    if rect.len() != 4 {
3144        return None;
3145    }
3146    Some([
3147        rect[0].as_float().ok()?,
3148        rect[1].as_float().ok()?,
3149        rect[2].as_float().ok()?,
3150        rect[3].as_float().ok()?,
3151    ])
3152}
3153
3154fn resolve_widget_normal_appearance(
3155    doc: &mut Document,
3156    annot_dict: &Dictionary,
3157) -> Option<ObjectId> {
3158    let ap = annot_dict.get(b"AP").ok()?.as_dict().ok()?;
3159    let normal = ap.get(b"N").ok()?;
3160    resolve_appearance_object(doc, annot_dict, normal)
3161}
3162
3163fn resolve_appearance_object(
3164    doc: &mut Document,
3165    annot_dict: &Dictionary,
3166    object: &Object,
3167) -> Option<ObjectId> {
3168    match object {
3169        Object::Reference(id) => match doc.get_object(*id).ok()?.clone() {
3170            Object::Stream(_) => Some(*id),
3171            Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, &states),
3172            _ => None,
3173        },
3174        Object::Stream(stream) => Some(doc.add_object(Object::Stream(stream.clone()))),
3175        Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, states),
3176        _ => None,
3177    }
3178}
3179
3180fn resolve_appearance_state(
3181    doc: &mut Document,
3182    annot_dict: &Dictionary,
3183    states: &Dictionary,
3184) -> Option<ObjectId> {
3185    if let Some(state) = selected_widget_state(annot_dict) {
3186        if let Ok(object) = states.get(state) {
3187            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3188                return Some(id);
3189            }
3190        }
3191        if state == b"Off" {
3192            // Previously assumed oracle always stamps on-mark for hybrid XFA widgets.
3193            // Corrected per GL-WF-01/M#55: /AS state from source data is authoritative.
3194            return None;
3195        }
3196    }
3197
3198    for fallback in [b"Yes".as_slice(), b"On".as_slice(), b"Off".as_slice()] {
3199        if let Ok(object) = states.get(fallback) {
3200            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3201                return Some(id);
3202            }
3203        }
3204    }
3205
3206    for (_name, object) in states.iter() {
3207        if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3208            return Some(id);
3209        }
3210    }
3211
3212    None
3213}
3214
3215fn selected_widget_state(annot_dict: &Dictionary) -> Option<&[u8]> {
3216    annot_dict
3217        .get(b"AS")
3218        .ok()
3219        .and_then(|obj| obj.as_name().ok())
3220        .or_else(|| annot_dict.get(b"V").ok().and_then(|obj| obj.as_name().ok()))
3221}
3222
/// Register `xobject_id` under `name` in the page's `/Resources` → `/XObject`
/// dictionary, creating whichever parts are missing.
///
/// Both `/Resources` and `/XObject` may be stored inline or as indirect
/// references; each combination is handled below, and the first one that
/// applies wins. An existing entry with the same `name` is replaced. Only the
/// page's own `/Resources` entry is consulted.
fn add_xobject_to_page_resources(
    doc: &mut Document,
    page_id: ObjectId,
    name: &str,
    xobject_id: ObjectId,
) {
    // Case A: /Resources is an indirect reference.
    let resources_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_reference().ok())
    });

    if let Some(resources_id) = resources_ref {
        // A1: the referenced resources dictionary points at an indirect
        // /XObject dictionary — add the entry there.
        let xobject_ref = doc.get_dictionary(resources_id).ok().and_then(|resources| {
            resources
                .get(b"XObject")
                .ok()
                .and_then(|obj| obj.as_reference().ok())
        });

        if let Some(xobject_dict_id) = xobject_ref {
            if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
                xobjects.set(name, Object::Reference(xobject_id));
                return;
            }
        }

        // A2: /XObject is inline or absent inside the referenced resources.
        if let Ok(Object::Dictionary(ref mut resources)) = doc.get_object_mut(resources_id) {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }
    }

    // Case B: /Resources is inline but its /XObject entry is a reference.
    let inline_xobject_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_dict().ok())
            .and_then(|resources| {
                resources
                    .get(b"XObject")
                    .ok()
                    .and_then(|obj| obj.as_reference().ok())
            })
    });

    if let Some(xobject_dict_id) = inline_xobject_ref {
        if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
            xobjects.set(name, Object::Reference(xobject_id));
            return;
        }
    }

    // Case C: everything is inline, or /Resources is missing altogether.
    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        if let Ok(Object::Dictionary(ref mut resources)) = page_dict.get_mut(b"Resources") {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }

        // No usable /Resources on the page: create one holding just this entry.
        let mut resources = Dictionary::new();
        add_xobject_to_resources_dict(&mut resources, name, xobject_id);
        page_dict.set("Resources", Object::Dictionary(resources));
    }
}
3288
3289fn add_xobject_to_resources_dict(resources: &mut Dictionary, name: &str, xobject_id: ObjectId) {
3290    if let Ok(Object::Dictionary(ref mut xobjects)) = resources.get_mut(b"XObject") {
3291        xobjects.set(name, Object::Reference(xobject_id));
3292    } else {
3293        let mut xobjects = Dictionary::new();
3294        xobjects.set(name, Object::Reference(xobject_id));
3295        resources.set("XObject", Object::Dictionary(xobjects));
3296    }
3297}
3298
3299fn append_to_page_content(doc: &mut Document, page_id: ObjectId, data: &[u8]) {
3300    let new_stream_id = doc.add_object(Object::Stream(Stream::new(dictionary! {}, data.to_vec())));
3301
3302    let contents = doc
3303        .get_dictionary(page_id)
3304        .ok()
3305        .and_then(|page_dict| page_dict.get(b"Contents").ok().cloned());
3306
3307    // Some PDFs store page /Contents as an indirect array of streams. Appending
3308    // by wrapping that array reference in another array creates nested content
3309    // arrays (`[ 1510 0 R 1574 0 R ]` where `1510 0 R` is itself an array),
3310    // which Poppler treats as "Weird page contents" and can blank the page.
3311    // Flatten the existing /Contents tree first so preserve-static/widget bake
3312    // paths remain valid on Adobe-generated forms like 697eeb9f.
3313    let new_contents = match contents {
3314        Some(existing) => {
3315            let mut flattened = Vec::new();
3316            flatten_page_contents_entries(doc, existing, &mut flattened);
3317            flattened.push(Object::Reference(new_stream_id));
3318            if flattened.len() == 1 {
3319                flattened.pop().unwrap()
3320            } else {
3321                Object::Array(flattened)
3322            }
3323        }
3324        None => Object::Reference(new_stream_id),
3325    };
3326
3327    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3328        page_dict.set("Contents", new_contents);
3329    }
3330}
3331
3332fn flatten_page_contents_entries(doc: &mut Document, object: Object, out: &mut Vec<Object>) {
3333    match object {
3334        Object::Reference(id) => match doc.get_object(id).cloned() {
3335            Ok(Object::Array(items)) => {
3336                for item in items {
3337                    flatten_page_contents_entries(doc, item, out);
3338                }
3339            }
3340            _ => out.push(Object::Reference(id)),
3341        },
3342        Object::Array(items) => {
3343            for item in items {
3344                flatten_page_contents_entries(doc, item, out);
3345            }
3346        }
3347        Object::Stream(stream) => {
3348            let stream_id = doc.add_object(Object::Stream(stream));
3349            out.push(Object::Reference(stream_id));
3350        }
3351        other => out.push(other),
3352    }
3353}
3354
/// Remove Widget annotations from all pages and strip /AcroForm from the catalog.
///
/// This is the "static-strip" flatten path used for hybrid XFA+static PDFs:
/// the original page content is preserved and only the interactive XFA/AcroForm
/// layer is removed.
///
/// This is a thin wrapper: all of the work, including the per-page widget
/// removal, is done by `remove_acroform`.
fn strip_widgets_and_acroform(doc: &mut Document) {
    remove_acroform(doc);
}
3363
3364/// Replace a page's /Contents stream with XFA overlay bytes and add font resource.
3365fn write_page_content(
3366    doc: &mut Document,
3367    page_id: ObjectId,
3368    overlay: &PageOverlay,
3369    font_ids: &[ObjectId; 3],
3370    embedded_fonts: &[(String, ObjectId)],
3371    page_width: Option<f64>,
3372    page_height: Option<f64>,
3373) -> Result<()> {
3374    let mut resources = make_resources_dict(font_ids, embedded_fonts);
3375
3376    let mut xobjects = Dictionary::new();
3377    for img in &overlay.images {
3378        match embed_image(doc, &img.data, &img.mime_type) {
3379            Ok(result) => {
3380                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
3381            }
3382            Err(e) => {
3383                eprintln!("failed to embed image {}: {}", img.name, e);
3384            }
3385        }
3386    }
3387    if !xobjects.is_empty() {
3388        resources.set("XObject", Object::Dictionary(xobjects));
3389    }
3390
3391    let stream = Stream::new(
3392        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
3393        overlay.content_stream.clone(),
3394    );
3395    let stream_id = doc.add_object(Object::Stream(stream));
3396
3397    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3398        page_dict.set("Contents", Object::Reference(stream_id));
3399        page_dict.set("Resources", Object::Dictionary(resources));
3400        // Update MediaBox to match the XFA layout page dimensions.
3401        // Dynamic XFA forms often have a placeholder page with different
3402        // dimensions than the template's pageArea (e.g. letter vs A4).
3403        if let (Some(w), Some(h)) = (page_width, page_height) {
3404            page_dict.set(
3405                "MediaBox",
3406                Object::Array(vec![
3407                    Object::Real(0.0),
3408                    Object::Real(0.0),
3409                    Object::Real(w as f32),
3410                    Object::Real(h as f32),
3411                ]),
3412            );
3413        }
3414    }
3415    Ok(())
3416}
3417
3418/// Overlay XFA content on top of existing page content (for static XFA forms).
3419///
3420/// Unlike `write_page_content` which replaces the page content entirely, this
3421/// preserves the original content stream and appends the XFA overlay on top.
3422/// The original resources are preserved and XFA font resources are merged in.
3423fn overlay_page_content(
3424    doc: &mut Document,
3425    page_id: ObjectId,
3426    overlay: &PageOverlay,
3427    font_ids: &[ObjectId; 3],
3428    embedded_fonts: &[(String, ObjectId)],
3429) -> Result<()> {
3430    let xfa_resources = make_resources_dict(font_ids, embedded_fonts);
3431
3432    let mut xfa_xobjects = Dictionary::new();
3433    for img in &overlay.images {
3434        match embed_image(doc, &img.data, &img.mime_type) {
3435            Ok(result) => {
3436                xfa_xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
3437            }
3438            Err(e) => {
3439                eprintln!("failed to embed image {}: {}", img.name, e);
3440            }
3441        }
3442    }
3443
3444    merge_xfa_resources_into_page(doc, page_id, &xfa_resources, &xfa_xobjects);
3445
3446    if !overlay.content_stream.is_empty() {
3447        append_to_page_content(doc, page_id, &overlay.content_stream);
3448    }
3449
3450    Ok(())
3451}
3452
/// Merge XFA font and XObject resources into the page's existing resources.
///
/// * `/Font`: XFA entries are added only under names the page does not use
///   yet, so the page's own fonts are never replaced.
/// * `/XObject`: XFA entries are always written, so an existing entry with
///   the same name IS replaced.
///
/// The merged dictionaries are stored inline on the page: an indirect
/// `/Resources`, `/Font` or `/XObject` is read through its reference, but the
/// result replaces the page's `/Resources` with an inline copy.
///
/// NOTE(review): only the page's own `/Resources` entry is read. If a page
/// relies on resources inherited from its `/Parent` node, the inline
/// dictionary written here would presumably shadow them — confirm.
fn merge_xfa_resources_into_page(
    doc: &mut Document,
    page_id: ObjectId,
    xfa_resources: &Dictionary,
    xfa_xobjects: &Dictionary,
) {
    // Start from a copy of the page's current resources (empty when absent).
    let existing_resources = doc
        .get_dictionary(page_id)
        .ok()
        .and_then(|page_dict| {
            page_dict.get(b"Resources").ok().and_then(|obj| match obj {
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                Object::Dictionary(d) => Some(d.clone()),
                _ => None,
            })
        })
        .unwrap_or_default();

    let mut merged = existing_resources;

    // Merge Font entries: add XFA fonts (F1, F2, F3, embedded) without
    // overwriting the page's own fonts.
    if let Ok(xfa_font_dict) = xfa_resources.get(b"Font").and_then(|o| o.as_dict()) {
        let existing_font = merged
            .get(b"Font")
            .ok()
            .and_then(|obj| match obj {
                Object::Dictionary(d) => Some(d.clone()),
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                _ => None,
            })
            .unwrap_or_default();

        let mut font_merged = existing_font;
        for (key, val) in xfa_font_dict.iter() {
            // Keep the page's font when the name is already taken.
            if font_merged.get(key).is_err() {
                font_merged.set(key.clone(), val.clone());
            }
        }
        merged.set("Font", Object::Dictionary(font_merged));
    }

    // Merge XObject entries.
    if !xfa_xobjects.is_empty() {
        let existing_xobj = merged
            .get(b"XObject")
            .ok()
            .and_then(|obj| match obj {
                Object::Dictionary(d) => Some(d.clone()),
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                _ => None,
            })
            .unwrap_or_default();

        let mut xobj_merged = existing_xobj;
        for (key, val) in xfa_xobjects.iter() {
            // Unlike fonts, an XFA XObject replaces a same-named page entry.
            xobj_merged.set(key.clone(), val.clone());
        }
        merged.set("XObject", Object::Dictionary(xobj_merged));
    }

    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        page_dict.set("Resources", Object::Dictionary(merged));
    }
}
3520
3521/// Add a new page to the document's /Pages tree.
3522fn add_new_page(
3523    doc: &mut Document,
3524    w: f64,
3525    h: f64,
3526    overlay: &PageOverlay,
3527    font_ids: &[ObjectId; 3],
3528    embedded_fonts: &[(String, ObjectId)],
3529) -> Result<()> {
3530    let mut resources = make_resources_dict(font_ids, embedded_fonts);
3531
3532    let mut xobjects = Dictionary::new();
3533    for img in &overlay.images {
3534        match embed_image(doc, &img.data, &img.mime_type) {
3535            Ok(result) => {
3536                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
3537            }
3538            Err(e) => {
3539                eprintln!("failed to embed image {}: {}", img.name, e);
3540            }
3541        }
3542    }
3543    if !xobjects.is_empty() {
3544        resources.set("XObject", Object::Dictionary(xobjects));
3545    }
3546
3547    let stream = Stream::new(
3548        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
3549        overlay.content_stream.clone(),
3550    );
3551    let stream_id = doc.add_object(Object::Stream(stream));
3552
3553    // Find the /Pages root to append to.
3554    let pages_id = find_pages_root(doc)?;
3555
3556    let page_id = doc.add_object(Object::Dictionary(dictionary! {
3557        "Type"      => Object::Name(b"Page".to_vec()),
3558        "Parent"    => Object::Reference(pages_id),
3559        "MediaBox"  => Object::Array(vec![
3560            Object::Integer(0), Object::Integer(0),
3561            Object::Real(w as f32), Object::Real(h as f32),
3562        ]),
3563        "Contents"  => Object::Reference(stream_id),
3564        "Resources" => Object::Dictionary(resources)
3565    }));
3566
3567    // Append to /Kids and increment /Count.
3568    if let Ok(Object::Dictionary(ref mut pages_dict)) = doc.get_object_mut(pages_id) {
3569        if let Ok(Object::Array(ref mut kids)) = pages_dict.get_mut(b"Kids") {
3570            kids.push(Object::Reference(page_id));
3571        }
3572        if let Ok(Object::Integer(ref mut count)) = pages_dict.get_mut(b"Count") {
3573            *count += 1;
3574        }
3575    }
3576    Ok(())
3577}
3578
3579fn make_resources_dict(
3580    font_ids: &[ObjectId; 3],
3581    embedded_fonts: &[(String, ObjectId)],
3582) -> Dictionary {
3583    let mut fonts = Dictionary::new();
3584    fonts.set("F1", Object::Reference(font_ids[0]));
3585    fonts.set("F2", Object::Reference(font_ids[1]));
3586    fonts.set("F3", Object::Reference(font_ids[2]));
3587    for (name, obj_id) in embedded_fonts {
3588        fonts.set(name.as_str(), Object::Reference(*obj_id));
3589    }
3590    let mut resources = Dictionary::new();
3591    resources.set("Font", Object::Dictionary(fonts));
3592    resources
3593}
3594
3595fn find_pages_root(doc: &Document) -> Result<ObjectId> {
3596    let root_id = doc
3597        .trailer
3598        .get(b"Root")
3599        .ok()
3600        .and_then(|o: &Object| o.as_reference().ok())
3601        .ok_or_else(|| XfaError::LoadFailed("no /Root in trailer".to_string()))?;
3602    let catalog = doc
3603        .get_dictionary(root_id)
3604        .map_err(|e| XfaError::LoadFailed(format!("catalog: {e}")))?;
3605    catalog
3606        .get(b"Pages")
3607        .ok()
3608        .and_then(|o: &Object| o.as_reference().ok())
3609        .ok_or_else(|| XfaError::LoadFailed("no /Pages in catalog".to_string()))
3610}
3611
3612/// Remove all interactive XFA/AcroForm artifacts from the PDF document.
3613///
3614/// XFA-F6-02 (#1110): this function ensures the output is a clean static PDF
3615/// with no residual interactive form markers. Steps performed:
3616///
3617/// 1. Remove `/AcroForm` from the catalog.
3618/// 2. Remove `/NeedsRendering` from the catalog.
3619/// 3. Remove `/XFA` from the AcroForm dictionary (if it was an indirect object
3620///    whose dict still exists in the object table).
3621/// 4. Remove orphaned XFA packet objects and the unreachable AcroForm object
3622///    from the lopdf object table.
3623/// 5. Remove widget annotations from all page `/Annots` arrays.
3624/// 6. Remove empty `/Annots` arrays left behind after widget removal.
3625fn remove_acroform(doc: &mut Document) {
3626    let root_id = match doc.trailer.get(b"Root") {
3627        Ok(Object::Reference(id)) => *id,
3628        _ => return,
3629    };
3630
3631    // Step 1 & 2: remove /AcroForm and /NeedsRendering from catalog.
3632    // Also capture the AcroForm object ID so we can clean up /XFA inside it.
3633    let acroform_id: Option<ObjectId> = {
3634        if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
3635            let acroform_ref = dict.get(b"AcroForm").ok().and_then(|o| {
3636                if let Object::Reference(id) = o {
3637                    Some(*id)
3638                } else {
3639                    None
3640                }
3641            });
3642            dict.remove(b"AcroForm");
3643            dict.remove(b"NeedsRendering");
3644            acroform_ref
3645        } else {
3646            None
3647        }
3648    };
3649
3650    // Step 3: collect /XFA stream object IDs, then remove /XFA from the
3651    // AcroForm dictionary object.
3652    let xfa_stream_ids: Vec<ObjectId> = acroform_id
3653        .and_then(|af_id| doc.get_dictionary(af_id).ok())
3654        .map(|af_dict| match af_dict.get(b"XFA") {
3655            Ok(Object::Array(arr)) => arr
3656                .iter()
3657                .filter_map(|o| {
3658                    if let Object::Reference(id) = o {
3659                        Some(*id)
3660                    } else {
3661                        None
3662                    }
3663                })
3664                .collect(),
3665            Ok(Object::Reference(id)) => vec![*id],
3666            _ => Vec::new(),
3667        })
3668        .unwrap_or_default();
3669
3670    if let Some(af_id) = acroform_id {
3671        if let Ok(Object::Dictionary(ref mut af_dict)) = doc.get_object_mut(af_id) {
3672            af_dict.remove(b"XFA");
3673        }
3674    }
3675
3676    // Step 4 (FSC-05): purge orphaned XFA packet objects and the unreachable
3677    // AcroForm dictionary from the object table. lopdf serializes every object
3678    // still present in doc.objects, even if the catalog no longer references it.
3679    for stream_id in xfa_stream_ids {
3680        doc.objects.remove(&stream_id);
3681    }
3682    if let Some(af_id) = acroform_id {
3683        doc.objects.remove(&af_id);
3684    }
3685
3686    // Step 5 & 6: remove widget annotations from every page's /Annots array,
3687    // then drop empty /Annots arrays entirely.
3688    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
3689    for page_id in page_ids {
3690        strip_widget_annotations(doc, page_id);
3691    }
3692}
3693
3694// ---------------------------------------------------------------------------
3695// XFA-F6-03 (#1111): Post-flatten validation
3696// ---------------------------------------------------------------------------
3697
/// Result of a post-flatten validation pass.
///
/// All `has_no_*` fields should be `true` for a clean flat PDF. Any
/// remaining XFA artifacts are reported in `warnings`.
///
/// Two special cases are produced by `validate_flattened_pdf`: an empty input
/// is reported with all three flags `true`, `page_count` 0 and a warning; an
/// input that cannot be parsed is reported with all three flags `false`,
/// `page_count` 0 and a warning carrying the parse error.
pub struct FlattenValidation {
    /// True when the catalog contains no `/XFA` entry (directly or via AcroForm).
    pub has_no_xfa: bool,
    /// True when the catalog contains no `/NeedsRendering` entry.
    pub has_no_needs_rendering: bool,
    /// True when the catalog contains no `/AcroForm` entry.
    pub has_no_acroform: bool,
    /// Number of pages in the output PDF.
    pub page_count: usize,
    /// Human-readable warnings for each detected XFA artifact.
    /// May also hold a note about empty or unparseable input.
    pub warnings: Vec<String>,
}
3714
3715/// Validate that a PDF has been fully flattened (no XFA/AcroForm artifacts remain).
3716///
3717/// Returns a [`FlattenValidation`] summary. Call after [`flatten_xfa_to_pdf`] to
3718/// confirm the output is clean.
3719///
3720/// This function never panics — parse failures produce a validation result with
3721/// all `has_no_*` fields set to `false` and a warning explaining the parse error.
3722pub fn validate_flattened_pdf(pdf_bytes: &[u8]) -> Result<FlattenValidation> {
3723    if pdf_bytes.is_empty() {
3724        return Ok(FlattenValidation {
3725            has_no_xfa: true,
3726            has_no_needs_rendering: true,
3727            has_no_acroform: true,
3728            page_count: 0,
3729            warnings: vec!["empty input — no PDF to validate".into()],
3730        });
3731    }
3732
3733    let doc = match Document::load_mem(pdf_bytes) {
3734        Ok(d) => d,
3735        Err(e) => {
3736            return Ok(FlattenValidation {
3737                has_no_xfa: false,
3738                has_no_needs_rendering: false,
3739                has_no_acroform: false,
3740                page_count: 0,
3741                warnings: vec![format!("could not parse PDF: {e}")],
3742            });
3743        }
3744    };
3745
3746    let mut warnings = Vec::new();
3747    let mut has_no_xfa = true;
3748    let mut has_no_needs_rendering = true;
3749    let mut has_no_acroform = true;
3750
3751    // Check catalog for AcroForm, NeedsRendering, and XFA.
3752    let root_id = doc.trailer.get(b"Root").ok().and_then(|o| {
3753        if let Object::Reference(id) = o {
3754            Some(*id)
3755        } else {
3756            None
3757        }
3758    });
3759
3760    if let Some(rid) = root_id {
3761        if let Ok(catalog) = doc.get_dictionary(rid) {
3762            if catalog.get(b"AcroForm").is_ok() {
3763                has_no_acroform = false;
3764                warnings.push("/AcroForm still present in catalog".into());
3765
3766                // Check whether the AcroForm dict contains /XFA.
3767                let acroform_has_xfa = catalog
3768                    .get(b"AcroForm")
3769                    .ok()
3770                    .and_then(|o| match o {
3771                        Object::Reference(id) => doc.get_dictionary(*id).ok(),
3772                        Object::Dictionary(d) => Some(d),
3773                        _ => None,
3774                    })
3775                    .map(|d| d.get(b"XFA").is_ok())
3776                    .unwrap_or(false);
3777
3778                if acroform_has_xfa {
3779                    has_no_xfa = false;
3780                    warnings.push("/XFA still present in AcroForm dictionary".into());
3781                }
3782            }
3783
3784            if catalog.get(b"NeedsRendering").is_ok() {
3785                has_no_needs_rendering = false;
3786                warnings.push("/NeedsRendering still present in catalog".into());
3787            }
3788
3789            // Direct /XFA on catalog (non-standard but possible).
3790            if catalog.get(b"XFA").is_ok() {
3791                has_no_xfa = false;
3792                warnings.push("/XFA still present directly in catalog".into());
3793            }
3794        }
3795    }
3796
3797    // Check page annotations for widget annotations.
3798    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
3799    let page_count = page_ids.len();
3800    for page_id in page_ids {
3801        for annot_obj in page_annotations(&doc, page_id) {
3802            let is_widget = annot_obj
3803                .as_reference()
3804                .ok()
3805                .and_then(|id| doc.get_dictionary(id).ok())
3806                .and_then(|d| {
3807                    d.get(b"Subtype")
3808                        .ok()
3809                        .map(|st| st == &Object::Name(b"Widget".to_vec()))
3810                })
3811                .unwrap_or(false);
3812            if is_widget {
3813                warnings.push(format!(
3814                    "widget annotation found on page (object {:?})",
3815                    annot_obj
3816                ));
3817            }
3818        }
3819    }
3820
3821    Ok(FlattenValidation {
3822        has_no_xfa,
3823        has_no_needs_rendering,
3824        has_no_acroform,
3825        page_count,
3826        warnings,
3827    })
3828}
3829
3830// ---------------------------------------------------------------------------
3831// XFA-F6-04 (#1112): Flatten quality metrics
3832// ---------------------------------------------------------------------------
3833
/// Metrics comparing a PDF before and after flattening.
///
/// Used by [`compare_flatten_quality`] and the `flatten-check` CLI subcommand.
/// As filled in by [`compare_flatten_quality`], a PDF that cannot be parsed
/// contributes 0 pages and 0 bytes.
#[derive(Debug, Clone, PartialEq)]
pub struct FlattenQualityMetrics {
    /// Number of pages in the original (pre-flatten) PDF.
    pub page_count_before: usize,
    /// Number of pages in the flattened (post-flatten) PDF.
    pub page_count_after: usize,
    /// True when `page_count_before == page_count_after`.
    pub page_count_match: bool,
    /// Total stored byte length of the stream objects in the original PDF.
    /// [`compare_flatten_quality`] sums every stream object's stored content,
    /// not only page content streams, and does not decompress.
    pub content_stream_bytes_before: usize,
    /// Total stored byte length of the stream objects in the flattened PDF,
    /// measured the same way as `content_stream_bytes_before`.
    pub content_stream_bytes_after: usize,
    /// Ratio of after/before stream bytes. 1.0 = same size, <1.0 = smaller.
    /// Is 1.0 when `content_stream_bytes_before == 0` to avoid division by zero.
    pub content_ratio: f64,
}
3852
3853/// Compute quality metrics comparing the original PDF to its flattened version.
3854///
3855/// Parses both byte slices and compares page count and total content stream size.
3856/// Returns an error only if both PDFs fail to parse.
3857pub fn compare_flatten_quality(
3858    original_bytes: &[u8],
3859    flattened_bytes: &[u8],
3860) -> Result<FlattenQualityMetrics> {
3861    fn count_pages_and_stream_bytes(pdf_bytes: &[u8]) -> (usize, usize) {
3862        let doc = match Document::load_mem(pdf_bytes) {
3863            Ok(d) => d,
3864            Err(_) => return (0, 0),
3865        };
3866        let page_count = doc.page_iter().count();
3867        let stream_bytes: usize = doc
3868            .objects
3869            .values()
3870            .filter_map(|obj| {
3871                if let Object::Stream(s) = obj {
3872                    // Use decompressed content length when available.
3873                    s.content.len().into()
3874                } else {
3875                    None
3876                }
3877            })
3878            .sum();
3879        (page_count, stream_bytes)
3880    }
3881
3882    let (page_count_before, content_stream_bytes_before) =
3883        count_pages_and_stream_bytes(original_bytes);
3884    let (page_count_after, content_stream_bytes_after) =
3885        count_pages_and_stream_bytes(flattened_bytes);
3886
3887    let content_ratio = if content_stream_bytes_before == 0 {
3888        1.0_f64
3889    } else {
3890        content_stream_bytes_after as f64 / content_stream_bytes_before as f64
3891    };
3892
3893    Ok(FlattenQualityMetrics {
3894        page_count_before,
3895        page_count_after,
3896        page_count_match: page_count_before == page_count_after,
3897        content_stream_bytes_before,
3898        content_stream_bytes_after,
3899        content_ratio,
3900    })
3901}
3902
3903// ---------------------------------------------------------------------------
3904// XFA-F7-02 (#1114): Text completeness validation
3905// ---------------------------------------------------------------------------
3906
/// Result of a text completeness validation pass.
///
/// Compares the data values bound in the original XFA datasets against the
/// text content extracted from the flattened PDF to verify all field values
/// appear in the output.
#[derive(Debug, Clone, PartialEq)]
pub struct TextValidation {
    /// Data values extracted from the original XFA datasets XML.
    pub expected_values: Vec<String>,
    /// Values from `expected_values` that were found in the output text.
    pub found_values: Vec<String>,
    /// Values from `expected_values` that were NOT found in the output text.
    pub missing_values: Vec<String>,
    /// Ratio of found/expected. 1.0 means all expected values are present.
    /// Is 1.0 when `expected_values` is empty (nothing to check).
    pub completeness_ratio: f64,
}
3923
3924/// Extract all non-empty text node values from XFA `<field>` elements in the
3925/// datasets XML packet.
3926fn extract_field_values_from_datasets(datasets_xml: &str) -> Vec<String> {
3927    // Minimal parser: locate every <field …> … </field> block and grab direct
3928    // text content (the value node inside).  We keep this dependency-free by
3929    // doing a simple byte-scan rather than pulling in an XML parser.
3930    let mut values = Vec::new();
3931    let mut remaining = datasets_xml;
3932
3933    while let Some(open_pos) = remaining.find("<field") {
3934        // Advance past the opening tag itself (up to the closing `>`).
3935        let tag_end = match remaining[open_pos..].find('>') {
3936            Some(p) => open_pos + p + 1,
3937            None => break,
3938        };
3939
3940        // Self-closing tag (<field … />) — no value.
3941        if remaining[open_pos..tag_end].ends_with("/>") {
3942            remaining = &remaining[tag_end..];
3943            continue;
3944        }
3945
3946        // Find the matching </field>.
3947        let close_tag = "</field>";
3948        match remaining[tag_end..].find(close_tag) {
3949            Some(close_pos) => {
3950                let inner = &remaining[tag_end..tag_end + close_pos];
3951                // Extract text from a nested <value><text>…</text></value> or
3952                // just plain text between the tags.
3953                let text = extract_innermost_text(inner);
3954                if !text.is_empty() {
3955                    values.push(text);
3956                }
3957                remaining = &remaining[tag_end + close_pos + close_tag.len()..];
3958            }
3959            None => break,
3960        }
3961    }
3962    values
3963}
3964
3965/// Given the inner content of a `<field>` element, return the first non-empty
3966/// text value found (handles `<value><text>…</text></value>` nesting).
3967fn extract_innermost_text(inner: &str) -> String {
3968    // Try <text>…</text> first.
3969    if let Some(start) = inner.find("<text>") {
3970        let content_start = start + "<text>".len();
3971        if let Some(end) = inner[content_start..].find("</text>") {
3972            let s = inner[content_start..content_start + end].trim().to_string();
3973            if !s.is_empty() {
3974                return s;
3975            }
3976        }
3977    }
3978    // Fall back to stripping all XML tags and returning the trimmed text.
3979    let stripped = strip_xml_tags(inner);
3980    stripped.trim().to_string()
3981}
3982
/// Drop everything between `<` and `>` (inclusive) and return what is left.
///
/// This is a plain character filter, not an XML parser: a `<` opens a tag, a
/// `>` closes one, and neither character is ever copied to the output.
fn strip_xml_tags(s: &str) -> String {
    let mut inside_tag = false;
    s.chars()
        .filter(|&ch| {
            if ch == '<' {
                inside_tag = true;
                false
            } else if ch == '>' {
                inside_tag = false;
                false
            } else {
                !inside_tag
            }
        })
        .collect()
}
3997
3998/// Extract visible text from a PDF content stream by scanning for the `Tj`,
3999/// `TJ`, `'`, and `"` text-showing operators.
4000///
4001/// This is a best-effort scan of the raw (potentially un-decoded) bytes.
4002/// It does not handle all encodings or compressed streams but is sufficient
4003/// for validating that literal ASCII/Latin text values are present.
4004fn extract_text_from_pdf_bytes(pdf_bytes: &[u8]) -> String {
4005    let doc = match Document::load_mem(pdf_bytes) {
4006        Ok(d) => d,
4007        Err(_) => return String::new(),
4008    };
4009
4010    let mut text = String::new();
4011
4012    for obj in doc.objects.values() {
4013        if let Object::Stream(ref stream) = obj {
4014            // Read raw stream content (decompression may fail silently).
4015            let content = match stream.decompressed_content() {
4016                Ok(c) => c,
4017                Err(_) => stream.content.clone(),
4018            };
4019            let fragment = extract_text_from_content_stream(&content);
4020            if !fragment.is_empty() {
4021                text.push(' ');
4022                text.push_str(&fragment);
4023            }
4024        }
4025    }
4026    text
4027}
4028
/// Scan a PDF content stream byte slice for literal string operands `( … )`
/// and return their text, each fragment preceded by a single space.
///
/// Behaviour:
/// - Every balanced, parenthesis-delimited literal is considered; the operator
///   that follows it (Tj, TJ, ', ") is not inspected.
/// - Nested and escaped parentheses belong to the enclosing literal, which is
///   consumed as a whole and not re-scanned for inner fragments.
/// - Backslash escapes (`\n`, `\r`, `\t`, `\b`, `\f`, `\(`, `\)`, `\\`, octal
///   `\ddd`, and backslash-newline continuations) are decoded so that the
///   extracted text matches the value that was rendered. An escape that would
///   decode to a non-printable or non-ASCII byte is kept verbatim instead.
/// - A literal is kept only if, after decoding, it consists solely of ASCII
///   letters, digits, punctuation and whitespace; this skips binary strings.
///   Kept literals are trimmed and empty ones are dropped.
/// - An opening parenthesis with no matching close is ignored.
fn extract_text_from_content_stream(content: &[u8]) -> String {
    // Bytes accepted by the printable filter: ASCII letters, digits,
    // punctuation, space, and the control whitespace range TAB..=CR.
    fn is_printable_ascii(byte: u8) -> bool {
        byte.is_ascii_alphanumeric()
            || byte.is_ascii_punctuation()
            || matches!(byte, b'\t'..=b'\r' | b' ')
    }

    // Index of the `)` that closes the literal whose body starts at `start`,
    // honouring nesting and skipping the byte after each backslash.
    fn literal_end(bytes: &[u8], start: usize) -> Option<usize> {
        let mut depth = 1usize;
        let mut idx = start;
        while idx < bytes.len() {
            match bytes[idx] {
                b'\\' => idx += 1, // skip escaped byte
                b'(' => depth += 1,
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(idx);
                    }
                }
                _ => {}
            }
            idx += 1;
        }
        None
    }

    // Decode the backslash escapes of a literal string body.
    fn decode_escapes(raw: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(raw.len());
        let mut idx = 0;
        while idx < raw.len() {
            if raw[idx] != b'\\' {
                out.push(raw[idx]);
                idx += 1;
                continue;
            }

            // `decoded` is None for a line continuation (emits nothing);
            // `consumed` counts the backslash plus the bytes it governs.
            let (decoded, consumed): (Option<u8>, usize) = match raw.get(idx + 1).copied() {
                None => (Some(b'\\'), 1),
                Some(b'n') => (Some(b'\n'), 2),
                Some(b'r') => (Some(b'\r'), 2),
                Some(b't') => (Some(b'\t'), 2),
                Some(b'b') => (Some(0x08), 2),
                Some(b'f') => (Some(0x0C), 2),
                Some(b'\n') => (None, 2),
                Some(b'\r') => {
                    let len = if raw.get(idx + 2) == Some(&b'\n') { 3 } else { 2 };
                    (None, len)
                }
                Some(first @ b'0'..=b'7') => {
                    // Up to three octal digits; overflow beyond one byte is dropped.
                    let mut value = u32::from(first - b'0');
                    let mut len = 2;
                    while len < 4 {
                        match raw.get(idx + len).copied() {
                            Some(digit @ b'0'..=b'7') => {
                                value = value * 8 + u32::from(digit - b'0');
                                len += 1;
                            }
                            _ => break,
                        }
                    }
                    (Some((value & 0xFF) as u8), len)
                }
                // Any other escaped byte stands for itself.
                Some(other) => (Some(other), 2),
            };

            match decoded {
                Some(byte) if is_printable_ascii(byte) => out.push(byte),
                // Keep unprintable results in their escaped spelling so that
                // decoding never causes a literal to be rejected.
                Some(_) => out.extend_from_slice(&raw[idx..idx + consumed]),
                None => {}
            }
            idx += consumed;
        }
        out
    }

    let mut result = String::new();
    let mut pos = 0;

    while pos < content.len() {
        if content[pos] != b'(' {
            pos += 1;
            continue;
        }

        let start = pos + 1;
        match literal_end(content, start) {
            Some(close) => {
                let decoded = decode_escapes(&content[start..close]);
                // Only collect printable ASCII — skip binary font strings.
                if decoded.iter().all(|&byte| is_printable_ascii(byte)) {
                    let literal: String = decoded.iter().map(|&byte| char::from(byte)).collect();
                    let trimmed = literal.trim();
                    if !trimmed.is_empty() {
                        result.push(' ');
                        result.push_str(trimmed);
                    }
                }
                // Resume after the closing parenthesis so nested or escaped
                // parentheses are not reported a second time.
                pos = close + 1;
            }
            // Unbalanced opener: a later `(` may still start a balanced literal.
            None => pos += 1,
        }
    }
    result
}
4073
4074/// Validate that all data values bound in the original XFA form appear in the
4075/// text content of the flattened PDF.
4076///
4077/// Steps:
4078/// 1. Extract the XFA `datasets` packet from `original_xfa_bytes`.
4079/// 2. Parse all `<field>` values from the datasets XML.
4080/// 3. Scan the flattened PDF's content streams for those strings.
4081/// 4. Return a [`TextValidation`] with completeness metrics.
4082///
4083/// Returns `Ok` even when the datasets packet is absent or the XFA cannot be
4084/// parsed — in that case `expected_values` will be empty and
4085/// `completeness_ratio` will be `1.0`.
4086pub fn validate_text_completeness(
4087    original_xfa_bytes: &[u8],
4088    flattened_bytes: &[u8],
4089) -> crate::error::Result<TextValidation> {
4090    // Step 1: extract the datasets packet from the original XFA PDF.
4091    let packets = match crate::extract::extract_xfa_from_bytes(original_xfa_bytes.to_vec()) {
4092        Ok(p) => p,
4093        Err(_) => {
4094            // Cannot extract XFA — nothing to validate.
4095            return Ok(TextValidation {
4096                expected_values: vec![],
4097                found_values: vec![],
4098                missing_values: vec![],
4099                completeness_ratio: 1.0,
4100            });
4101        }
4102    };
4103
4104    let datasets_xml = match packets.datasets() {
4105        Some(ds) => ds.to_string(),
4106        None => {
4107            return Ok(TextValidation {
4108                expected_values: vec![],
4109                found_values: vec![],
4110                missing_values: vec![],
4111                completeness_ratio: 1.0,
4112            });
4113        }
4114    };
4115
4116    // Step 2: extract field values.
4117    let expected_values = extract_field_values_from_datasets(&datasets_xml);
4118
4119    if expected_values.is_empty() {
4120        return Ok(TextValidation {
4121            expected_values: vec![],
4122            found_values: vec![],
4123            missing_values: vec![],
4124            completeness_ratio: 1.0,
4125        });
4126    }
4127
4128    // Step 3: extract text from the flattened PDF.
4129    let output_text = extract_text_from_pdf_bytes(flattened_bytes);
4130
4131    // Step 4: check which expected values appear in the output.
4132    let mut found_values = Vec::new();
4133    let mut missing_values = Vec::new();
4134
4135    for value in &expected_values {
4136        if output_text.contains(value.as_str()) {
4137            found_values.push(value.clone());
4138        } else {
4139            missing_values.push(value.clone());
4140        }
4141    }
4142
4143    let completeness_ratio = if expected_values.is_empty() {
4144        1.0
4145    } else {
4146        found_values.len() as f64 / expected_values.len() as f64
4147    };
4148
4149    Ok(TextValidation {
4150        expected_values,
4151        found_values,
4152        missing_values,
4153        completeness_ratio,
4154    })
4155}
4156
4157// ---------------------------------------------------------------------------
4158// Tests
4159// ---------------------------------------------------------------------------
4160
/// Test helper (GL-QA36): call `flatten_xfa_to_pdf` as if a flatten were
/// already in progress on this thread.
///
/// `FLATTEN_DEPTH` is set to 1 before the call and unconditionally set back to
/// 0 afterwards, so later calls on the same thread start from a clean depth.
/// This lets tests exercise the recursion guard without exposing
/// `FLATTEN_DEPTH` to the test sub-module.
///
/// NOTE(review): the explicit reset presumably exists because the guard
/// returns early at depth >= 1 without decrementing — confirm against
/// `flatten_xfa_to_pdf`.
#[cfg(test)]
fn flatten_xfa_to_pdf_simulate_reentrant(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
    // Pretend one flatten call is already on the stack.
    FLATTEN_DEPTH.with(|depth| depth.set(1));
    let outcome = flatten_xfa_to_pdf(pdf_bytes);
    // Restore the idle state regardless of how the call ended.
    FLATTEN_DEPTH.with(|depth| depth.set(0));
    outcome
}
4176
4177#[cfg(test)]
4178mod tests {
4179    use super::*;
4180
4181    /// Build a minimal XFA PDF in memory (same as generate_xfa_layout_fixtures).
4182    fn build_xfa_pdf_with_content(xdp: &str, page_content: Vec<u8>) -> Vec<u8> {
4183        use lopdf::{dictionary, Document, Object, Stream};
4184        let mut doc = Document::with_version("1.4");
4185        let xdp_bytes = xdp.as_bytes().to_vec();
4186        let xfa_stream = Stream::new(
4187            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4188            xdp_bytes,
4189        );
4190        let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4191        let pages_id = doc.new_object_id();
4192        let content_stream = Stream::new(
4193            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4194            page_content,
4195        );
4196        let content_id = doc.add_object(Object::Stream(content_stream));
4197        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4198            "Type"     => Object::Name(b"Page".to_vec()),
4199            "Parent"   => Object::Reference(pages_id),
4200            "MediaBox" => Object::Array(vec![
4201                Object::Integer(0), Object::Integer(0),
4202                Object::Integer(612), Object::Integer(792),
4203            ]),
4204            "Contents" => Object::Reference(content_id)
4205        }));
4206        doc.objects.insert(
4207            pages_id,
4208            Object::Dictionary(dictionary! {
4209                "Type"  => Object::Name(b"Pages".to_vec()),
4210                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4211                "Count" => Object::Integer(1)
4212            }),
4213        );
4214        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4215            "XFA"    => Object::Reference(xfa_id),
4216            "Fields" => Object::Array(vec![])
4217        }));
4218        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4219            "Type"     => Object::Name(b"Catalog".to_vec()),
4220            "Pages"    => Object::Reference(pages_id),
4221            "AcroForm" => Object::Reference(acroform_id)
4222        }));
4223        doc.trailer.set("Root", Object::Reference(catalog_id));
4224        let mut out = Vec::new();
4225        doc.save_to(&mut out).unwrap();
4226        out
4227    }
4228
4229    fn build_xfa_pdf(xdp: &str) -> Vec<u8> {
4230        build_xfa_pdf_with_content(xdp, Vec::new())
4231    }
4232
4233    fn build_xfa_doc_with_xfa_array() -> (Document, ObjectId, Vec<ObjectId>) {
4234        use lopdf::{dictionary, Document, Object, Stream};
4235
4236        let mut doc = Document::with_version("1.4");
4237        let pages_id = doc.new_object_id();
4238        let content_id = doc.add_object(Object::Stream(Stream::new(
4239            dictionary! { "Length" => Object::Integer(0) },
4240            Vec::new(),
4241        )));
4242        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4243            "Type"     => Object::Name(b"Page".to_vec()),
4244            "Parent"   => Object::Reference(pages_id),
4245            "MediaBox" => Object::Array(vec![
4246                Object::Integer(0), Object::Integer(0),
4247                Object::Integer(612), Object::Integer(792),
4248            ]),
4249            "Contents" => Object::Reference(content_id)
4250        }));
4251        doc.objects.insert(
4252            pages_id,
4253            Object::Dictionary(dictionary! {
4254                "Type"  => Object::Name(b"Pages".to_vec()),
4255                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4256                "Count" => Object::Integer(1)
4257            }),
4258        );
4259
4260        let packet_payloads = [
4261            (
4262                b"xdp:xdp".to_vec(),
4263                br#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#.to_vec(),
4264            ),
4265            (
4266                b"template".to_vec(),
4267                br#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform/></template>"#
4268                    .to_vec(),
4269            ),
4270            (
4271                b"datasets".to_vec(),
4272                br#"<xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"></xfa:datasets>"#
4273                    .to_vec(),
4274            ),
4275        ];
4276
4277        let mut xfa_array = Vec::new();
4278        let mut xfa_ids = Vec::new();
4279        for (packet_name, payload) in packet_payloads {
4280            let stream_id = doc.add_object(Object::Stream(Stream::new(
4281                dictionary! { "Length" => Object::Integer(payload.len() as i64) },
4282                payload,
4283            )));
4284            xfa_array.push(Object::Name(packet_name));
4285            xfa_array.push(Object::Reference(stream_id));
4286            xfa_ids.push(stream_id);
4287        }
4288
4289        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4290            "XFA"    => Object::Array(xfa_array),
4291            "Fields" => Object::Array(vec![])
4292        }));
4293        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4294            "Type"     => Object::Name(b"Catalog".to_vec()),
4295            "Pages"    => Object::Reference(pages_id),
4296            "AcroForm" => Object::Reference(acroform_id)
4297        }));
4298        doc.trailer.set("Root", Object::Reference(catalog_id));
4299        (doc, acroform_id, xfa_ids)
4300    }
4301
4302    fn build_xfa_pdf_with_widget_appearance(
4303        page_content: Vec<u8>,
4304        normal_appearance: Object,
4305        widget_extra: Dictionary,
4306    ) -> Vec<u8> {
4307        use lopdf::{dictionary, Document, Object, Stream};
4308
4309        let mut doc = Document::with_version("1.4");
4310        let xdp_bytes = SIMPLE_XDP.as_bytes().to_vec();
4311        let xfa_stream = Stream::new(
4312            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4313            xdp_bytes,
4314        );
4315        let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4316
4317        let pages_id = doc.new_object_id();
4318        let content_id = doc.add_object(Object::Stream(Stream::new(
4319            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4320            page_content,
4321        )));
4322
4323        let appearance_id = match normal_appearance {
4324            Object::Reference(id) => id,
4325            other => doc.add_object(other),
4326        };
4327
4328        let widget_id = doc.new_object_id();
4329        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4330            "Type"     => Object::Name(b"Page".to_vec()),
4331            "Parent"   => Object::Reference(pages_id),
4332            "MediaBox" => Object::Array(vec![
4333                Object::Integer(0), Object::Integer(0),
4334                Object::Integer(612), Object::Integer(792),
4335            ]),
4336            "Contents" => Object::Reference(content_id),
4337            "Annots"   => Object::Array(vec![Object::Reference(widget_id)]),
4338            "Resources" => Object::Dictionary(dictionary! {})
4339        }));
4340
4341        let mut widget = dictionary! {
4342            "Type"    => Object::Name(b"Annot".to_vec()),
4343            "Subtype" => Object::Name(b"Widget".to_vec()),
4344            "Rect"    => Object::Array(vec![
4345                Object::Integer(100), Object::Integer(700),
4346                Object::Integer(220), Object::Integer(730),
4347            ]),
4348            "AP"      => Object::Dictionary(dictionary! {
4349                "N" => Object::Reference(appearance_id)
4350            }),
4351            "P"       => Object::Reference(page_id)
4352        };
4353        for (key, value) in widget_extra {
4354            widget.set(key, value);
4355        }
4356        doc.objects.insert(widget_id, Object::Dictionary(widget));
4357
4358        doc.objects.insert(
4359            pages_id,
4360            Object::Dictionary(dictionary! {
4361                "Type"  => Object::Name(b"Pages".to_vec()),
4362                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4363                "Count" => Object::Integer(1)
4364            }),
4365        );
4366
4367        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4368            "XFA"    => Object::Reference(xfa_id),
4369            "Fields" => Object::Array(vec![Object::Reference(widget_id)])
4370        }));
4371        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4372            "Type"     => Object::Name(b"Catalog".to_vec()),
4373            "Pages"    => Object::Reference(pages_id),
4374            "AcroForm" => Object::Reference(acroform_id)
4375        }));
4376        doc.trailer.set("Root", Object::Reference(catalog_id));
4377
4378        let mut out = Vec::new();
4379        doc.save_to(&mut out).unwrap();
4380        out
4381    }
4382
4383    #[allow(dead_code)]
4384    fn find_last_content_stream<'a>(doc: &'a Document, page_id: ObjectId) -> &'a Stream {
4385        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4386        match page_dict.get(b"Contents").expect("contents") {
4387            Object::Reference(id) => doc
4388                .get_object(*id)
4389                .expect("contents object")
4390                .as_stream()
4391                .expect("contents stream"),
4392            Object::Array(arr) => {
4393                let last = arr.last().expect("last content stream");
4394                let id = last.as_reference().expect("contents ref");
4395                doc.get_object(id)
4396                    .expect("contents object")
4397                    .as_stream()
4398                    .expect("contents stream")
4399            }
4400            other => other.as_stream().expect("contents stream"),
4401        }
4402    }
4403
4404    #[allow(dead_code)]
4405    fn page_xobjects(doc: &Document, page_id: ObjectId) -> Dictionary {
4406        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4407        let resources = page_dict
4408            .get(b"Resources")
4409            .expect("resources")
4410            .as_dict()
4411            .expect("resources dict");
4412        resources
4413            .get(b"XObject")
4414            .expect("xobjects")
4415            .as_dict()
4416            .expect("xobject dict")
4417            .clone()
4418    }
4419
4420    #[test]
4421    fn append_to_page_content_flattens_indirect_contents_arrays() {
4422        let mut doc = Document::with_version("1.4");
4423        let pages_id = doc.new_object_id();
4424        let first_stream_id = doc.add_object(Stream::new(dictionary! {}, b"q\n".to_vec()));
4425        let second_stream_id = doc.add_object(Stream::new(dictionary! {}, b"Q\n".to_vec()));
4426        let contents_array_id = doc.add_object(Object::Array(vec![
4427            Object::Reference(first_stream_id),
4428            Object::Reference(second_stream_id),
4429        ]));
4430        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4431            "Type" => Object::Name(b"Page".to_vec()),
4432            "Parent" => Object::Reference(pages_id),
4433            "MediaBox" => Object::Array(vec![
4434                Object::Integer(0), Object::Integer(0),
4435                Object::Integer(612), Object::Integer(792),
4436            ]),
4437            "Contents" => Object::Reference(contents_array_id),
4438        }));
4439        doc.objects.insert(
4440            pages_id,
4441            Object::Dictionary(dictionary! {
4442                "Type" => Object::Name(b"Pages".to_vec()),
4443                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
4444                "Count" => Object::Integer(1),
4445            }),
4446        );
4447
4448        append_to_page_content(&mut doc, page_id, b"BT\nET\n");
4449
4450        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4451        let contents = page_dict.get(b"Contents").expect("contents");
4452        let items = contents.as_array().expect("flattened contents array");
4453
4454        assert_eq!(items.len(), 3, "existing streams + appended stream");
4455        assert!(
4456            items.iter().all(|obj| obj.as_reference().is_ok()),
4457            "contents array must stay flat and reference only streams"
4458        );
4459        for object in items {
4460            let stream_id = object.as_reference().expect("stream ref");
4461            assert!(
4462                doc.get_object(stream_id)
4463                    .expect("stream object")
4464                    .as_stream()
4465                    .is_ok(),
4466                "nested arrays must not survive in page contents"
4467            );
4468        }
4469    }
4470
    /// Minimal XDP fixture: a paginated root subform with one Letter-sized page
    /// area and a single `firstName` text field captioned "First Name" whose
    /// template value is "John".
    const SIMPLE_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
4491
    /// Same form as the single-field fixture, plus an `initialize` event on the
    /// `section` subform whose JavaScript calls `app.alert('blocked')`.
    /// NOTE(review): presumably used to check that scripts are not executed
    /// during flattening — confirm against the tests that reference it.
    const JS_EVENT_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <event activity="initialize">
        <script contentType="application/x-javascript">app.alert('blocked');</script>
      </event>
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
4515
4516    fn overflowing_paginate_xdp(base_profile: Option<&str>) -> String {
4517        let mut fields = String::new();
4518        for i in 0..40 {
4519            fields.push_str(&format!(
4520                r#"
4521      <field name="line{i}" w="7.0in" h="0.3in">
4522        <ui><textEdit/></ui>
4523        <value><text>Line {i}</text></value>
4524      </field>"#
4525            ));
4526        }
4527
4528        let base_profile_attr = base_profile
4529            .map(|value| format!(r#" baseProfile="{value}""#))
4530            .unwrap_or_default();
4531
4532        format!(
4533            r#"<?xml version="1.0" encoding="UTF-8"?>
4534<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
4535<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"{base_profile_attr}>
4536  <subform name="form1" layout="paginate">
4537    <pageSet>
4538      <pageArea name="Page1">
4539        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
4540        <medium stock="default" short="8.5in" long="11in"/>
4541      </pageArea>
4542    </pageSet>
4543    <subform name="section" layout="tb" w="7.5in">{fields}
4544    </subform>
4545  </subform>
4546</template>
4547</xdp:xdp>"#
4548        )
4549    }
4550
4551    #[test]
4552    fn flatten_simple_form_produces_non_empty_content() {
4553        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
4554        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4555
4556        // Load the result and check the content stream is non-empty.
4557        let doc = Document::load_mem(&result).expect("load flattened PDF");
4558        let pages: Vec<ObjectId> = doc.page_iter().collect();
4559        assert!(!pages.is_empty(), "flattened PDF has no pages");
4560
4561        // At least one page should have a non-empty content stream.
4562        let mut found_content = false;
4563        for page_id in &pages {
4564            if let Ok(page_dict) = doc.get_dictionary(*page_id) {
4565                if let Ok(contents_ref) = page_dict.get(b"Contents") {
4566                    if let Object::Reference(stream_id) = contents_ref {
4567                        if let Ok(obj) = doc.get_object(*stream_id) {
4568                            if let Ok(stream) = obj.as_stream() {
4569                                if !stream.content.is_empty() {
4570                                    found_content = true;
4571                                }
4572                            }
4573                        }
4574                    }
4575                }
4576            }
4577        }
4578        assert!(found_content, "all content streams are empty after flatten");
4579    }
4580
4581    #[test]
4582    fn flatten_reports_best_effort_for_xfa_javascript_event() {
4583        let pdf_bytes = build_xfa_pdf(JS_EVENT_XDP);
4584
4585        let (flattened, metadata) =
4586            flatten_xfa_to_pdf_with_metadata(&pdf_bytes).expect("flatten should skip JS");
4587
4588        assert!(!flattened.is_empty());
4589        assert_eq!(metadata.output_quality, OutputQuality::BestEffort);
4590        assert!(metadata.dynamic_scripts.js_present);
4591        assert_eq!(metadata.dynamic_scripts.js_skipped, 1);
4592    }
4593
4594    #[test]
4595    fn flatten_strips_catalog_open_action_javascript() {
4596        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
4597        {
4598            let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
4599            let root_id = match doc.trailer.get(b"Root") {
4600                Ok(Object::Reference(id)) => *id,
4601                _ => panic!("no Root in test PDF"),
4602            };
4603            if let Ok(Object::Dictionary(catalog)) = doc.get_object_mut(root_id) {
4604                catalog.set(
4605                    "OpenAction",
4606                    Object::Dictionary(dictionary! {
4607                        "S" => Object::Name(b"JavaScript".to_vec()),
4608                        "JS" => Object::String(
4609                            b"app.alert('blocked')".to_vec(),
4610                            lopdf::StringFormat::Literal,
4611                        ),
4612                    }),
4613                );
4614            }
4615            let mut out = Vec::new();
4616            doc.save_to(&mut out).expect("save test PDF");
4617            pdf_bytes = out;
4618        }
4619
4620        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4621        let doc = Document::load_mem(&flattened).expect("load flattened PDF");
4622        let root_id = match doc.trailer.get(b"Root") {
4623            Ok(Object::Reference(id)) => *id,
4624            _ => panic!("no Root in flattened PDF"),
4625        };
4626        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
4627        assert!(
4628            catalog.get(b"OpenAction").is_err(),
4629            "/OpenAction JavaScript must be stripped from flattened output"
4630        );
4631    }
4632
4633    /// Tests the canonical XFA nesting: <subform layout="paginate"> wraps
4634    /// <pageSet> + lr-tb content rows.  Verifies the flatten produces a single
4635    /// page with visible field content (border operators in the content stream).
4636    /// Before the extract_page_structure fix this produced 2 pages: page 1
4637    /// was blank (pageSet occupied 792pt) and page 2 had the actual fields.
4638    #[test]
4639    fn flatten_paginate_subform_with_nested_pageset_produces_visible_content() {
4640        const LR_TB_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
4641<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
4642<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
4643  <subform name="form1" layout="paginate" locale="en_US">
4644    <pageSet>
4645      <pageArea name="Page1" id="Page1">
4646        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
4647        <medium stock="default" short="8.5in" long="11in"/>
4648      </pageArea>
4649    </pageSet>
4650    <subform name="row1" layout="lr-tb" w="7.5in" h="0.4in">
4651      <field name="firstName" w="3.5in" h="0.4in">
4652        <caption><value><text>First</text></value></caption>
4653        <ui><textEdit/></ui>
4654        <value><text>John</text></value>
4655      </field>
4656      <field name="lastName" w="3.5in" h="0.4in">
4657        <caption><value><text>Last</text></value></caption>
4658        <ui><textEdit/></ui>
4659        <value><text>Doe</text></value>
4660      </field>
4661    </subform>
4662  </subform>
4663</template>
4664</xdp:xdp>"#;
4665
4666        let pdf_bytes = build_xfa_pdf(LR_TB_XDP);
4667        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4668
4669        let doc = Document::load_mem(&result).expect("load flattened PDF");
4670        let pages: Vec<ObjectId> = doc.page_iter().collect();
4671
4672        // Must produce exactly 1 page (not 2 as with the blank-first-page bug).
4673        assert_eq!(pages.len(), 1, "expected 1 page, got {}", pages.len());
4674
4675        // Page 1 must contain visible text operators from the field values.
4676        // (Fields with non-empty values produce WrappedText → BT/ET operators.)
4677        if let Ok(page_dict) = doc.get_dictionary(pages[0]) {
4678            if let Ok(lopdf::Object::Reference(stream_id)) = page_dict.get(b"Contents") {
4679                if let Ok(obj) = doc.get_object(*stream_id) {
4680                    if let Ok(stream) = obj.as_stream() {
4681                        let content = String::from_utf8_lossy(&stream.content);
4682                        assert!(
4683                            content.contains("BT\n"),
4684                            "no text operators in page 1 content stream (should have BT from field values)"
4685                        );
4686                        assert!(
4687                            content.contains("Tj\n"),
4688                            "no text show operators in page 1 content stream"
4689                        );
4690                    }
4691                }
4692            }
4693        }
4694    }
4695
4696    #[test]
4697    fn static_single_page_pdf_does_not_append_xfa_overflow_pages() {
4698        let xdp = overflowing_paginate_xdp(Some("interactiveForms"));
4699        let pdf_bytes = build_xfa_pdf(&xdp);
4700        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4701
4702        let doc = Document::load_mem(&result).expect("load flattened PDF");
4703        let pages: Vec<ObjectId> = doc.page_iter().collect();
4704
4705        assert_eq!(
4706            pages.len(),
4707            1,
4708            "static 1-page PDFs should preserve the original page when XFA layout over-paginates"
4709        );
4710    }
4711
4712    #[test]
4713    fn dynamic_single_page_pdf_can_expand_beyond_original_page_count() {
4714        // Dynamic XFA forms may ship with a single placeholder PDF page while
4715        // Adobe lays out multiple pages from the XFA data/template at runtime.
4716        // Flattening must therefore preserve the layout engine's page count
4717        // instead of clamping to the original PDF page count.
4718        let xdp = overflowing_paginate_xdp(None);
4719        let pdf_bytes = build_xfa_pdf(&xdp);
4720        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4721
4722        let doc = Document::load_mem(&result).expect("load flattened PDF");
4723        let pages: Vec<ObjectId> = doc.page_iter().collect();
4724
4725        assert_eq!(
4726            pages.len(),
4727            2,
4728            "dynamic 1-page PDFs should be allowed to grow when XFA layout paginates"
4729        );
4730    }
4731
4732    #[test]
4733    fn flatten_removes_acroform() {
4734        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
4735        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4736        let doc = Document::load_mem(&result).expect("load flattened PDF");
4737        let root_id = doc.trailer.get(b"Root").unwrap().as_reference().unwrap();
4738        let catalog = doc.get_dictionary(root_id).unwrap();
4739        assert!(
4740            catalog.get(b"AcroForm").is_err(),
4741            "/AcroForm still present after flatten"
4742        );
4743    }
4744
4745    #[test]
4746    fn flatten_non_xfa_pdf_unchanged() {
4747        // A PDF with no XFA should be returned as-is (no error).
4748        let mut doc = Document::with_version("1.4");
4749        let pages_id = doc.new_object_id();
4750        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4751            "Type"   => Object::Name(b"Page".to_vec()),
4752            "Parent" => Object::Reference(pages_id),
4753            "MediaBox" => Object::Array(vec![
4754                Object::Integer(0), Object::Integer(0),
4755                Object::Integer(612), Object::Integer(792),
4756            ])
4757        }));
4758        doc.objects.insert(
4759            pages_id,
4760            Object::Dictionary(dictionary! {
4761                "Type"  => Object::Name(b"Pages".to_vec()),
4762                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4763                "Count" => Object::Integer(1)
4764            }),
4765        );
4766        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4767            "Type"  => Object::Name(b"Catalog".to_vec()),
4768            "Pages" => Object::Reference(pages_id)
4769        }));
4770        doc.trailer.set("Root", Object::Reference(catalog_id));
4771        let mut raw = Vec::new();
4772        doc.save_to(&mut raw).unwrap();
4773
4774        // flatten_xfa_to_pdf should return Ok (with the same bytes).
4775        let result = flatten_xfa_to_pdf(&raw).expect("flatten non-XFA failed");
4776        assert!(!result.is_empty());
4777    }
4778
4779    #[test]
4780    fn placeholder_only_page_does_not_trigger_static_strip_path() {
4781        const PLACEHOLDER_STREAM: &str = r#"BT
4782/Helv 24 Tf
478372 720 Td
4784(Please wait...) Tj
47850 -32 Td
4786(If this message is not eventually replaced by the proper contents of the document,) Tj
47870 -32 Td
4788(your PDF viewer may not be able to display this type of document.) Tj
47890 -32 Td
4790(You can upgrade to the latest version of Adobe Reader by visiting reader_download.) Tj
4791ET
4792"#;
4793
4794        let pdf_bytes =
4795            build_xfa_pdf_with_content(SIMPLE_XDP, PLACEHOLDER_STREAM.as_bytes().to_vec());
4796        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4797
4798        let doc = Document::load_mem(&result).expect("load flattened PDF");
4799        let page_id = doc.page_iter().next().expect("flattened page");
4800        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4801        let contents_id = page_dict
4802            .get(b"Contents")
4803            .ok()
4804            .and_then(|object| object.as_reference().ok())
4805            .expect("contents ref");
4806        let stream = doc
4807            .get_object(contents_id)
4808            .expect("contents object")
4809            .as_stream()
4810            .expect("contents stream");
4811        let content = String::from_utf8_lossy(&stream.content);
4812
4813        assert!(
4814            content.contains("John"),
4815            "flattened page should contain XFA-rendered field content"
4816        );
4817        assert!(
4818            !content.contains("Please wait"),
4819            "placeholder text should not survive XFA flattening"
4820        );
4821    }
4822
4823    #[test]
4824    fn hybrid_static_pdf_uses_xfa_layout_over_static_content() {
4825        // When a PDF has both XFA template and static page content,
4826        // XFA layout should always take priority — the static content
4827        // may be a pre-rendered preview with wrong page count (#744).
4828        let appearance = Object::Stream(Stream::new(
4829            dictionary! {
4830                "Type" => Object::Name(b"XObject".to_vec()),
4831                "Subtype" => Object::Name(b"Form".to_vec()),
4832                "BBox" => Object::Array(vec![
4833                    Object::Integer(0), Object::Integer(0),
4834                    Object::Integer(120), Object::Integer(30),
4835                ]),
4836                "Matrix" => Object::Array(vec![
4837                    Object::Integer(1), Object::Integer(0),
4838                    Object::Integer(0), Object::Integer(1),
4839                    Object::Integer(0), Object::Integer(0),
4840                ]),
4841                "Resources" => Object::Dictionary(dictionary! {}),
4842            },
4843            b"0 G\n0.5 0.5 119 29 re\ns\n".to_vec(),
4844        ));
4845        // Enough Tj operators (≥5) to exceed the old static content threshold.
4846        let page_content = b"BT /F1 12 Tf 72 720 Td (Line 1) Tj 0 -14 Td (Line 2) Tj 0 -14 Td (Line 3) Tj 0 -14 Td (Line 4) Tj 0 -14 Td (Line 5) Tj ET\n".to_vec();
4847        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
4848            page_content,
4849            appearance,
4850            dictionary! {
4851                "FT" => Object::Name(b"Tx".to_vec()),
4852                "T" => Object::string_literal("field[0]"),
4853            },
4854        );
4855
4856        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4857        let doc = Document::load_mem(&result).expect("load flattened PDF");
4858        let page_id = doc.page_iter().next().expect("page");
4859        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4860
4861        // XFA layout produces pages without widget annotations.
4862        assert!(
4863            page_dict.get(b"Annots").is_err(),
4864            "XFA-flattened page should have no annotations"
4865        );
4866    }
4867
4868    #[test]
4869    fn hybrid_static_pdf_uses_selected_button_appearance_state() {
4870        let yes_stream = Object::Stream(Stream::new(
4871            dictionary! {
4872                "Type" => Object::Name(b"XObject".to_vec()),
4873                "Subtype" => Object::Name(b"Form".to_vec()),
4874                "BBox" => Object::Array(vec![
4875                    Object::Integer(0), Object::Integer(0),
4876                    Object::Integer(20), Object::Integer(20),
4877                ]),
4878                "Matrix" => Object::Array(vec![
4879                    Object::Integer(1), Object::Integer(0),
4880                    Object::Integer(0), Object::Integer(1),
4881                    Object::Integer(0), Object::Integer(0),
4882                ]),
4883                "Resources" => Object::Dictionary(dictionary! {}),
4884            },
4885            b"BT /F1 8 Tf 1 1 Td (YES) Tj ET\n".to_vec(),
4886        ));
4887        let off_stream = Object::Stream(Stream::new(
4888            dictionary! {
4889                "Type" => Object::Name(b"XObject".to_vec()),
4890                "Subtype" => Object::Name(b"Form".to_vec()),
4891                "BBox" => Object::Array(vec![
4892                    Object::Integer(0), Object::Integer(0),
4893                    Object::Integer(20), Object::Integer(20),
4894                ]),
4895                "Matrix" => Object::Array(vec![
4896                    Object::Integer(1), Object::Integer(0),
4897                    Object::Integer(0), Object::Integer(1),
4898                    Object::Integer(0), Object::Integer(0),
4899                ]),
4900                "Resources" => Object::Dictionary(dictionary! {}),
4901            },
4902            b"BT /F1 8 Tf 1 1 Td (OFF) Tj ET\n".to_vec(),
4903        ));
4904
4905        let mut doc = Document::with_version("1.4");
4906        let state_id = doc.add_object(Object::Dictionary(dictionary! {
4907            "Yes" => yes_stream,
4908            "Off" => off_stream,
4909        }));
4910        let annot = dictionary! {
4911            "Subtype" => Object::Name(b"Widget".to_vec()),
4912            "Rect" => Object::Array(vec![
4913                Object::Integer(100), Object::Integer(700),
4914                Object::Integer(120), Object::Integer(720),
4915            ]),
4916            "AP" => Object::Dictionary(dictionary! {
4917                "N" => Object::Reference(state_id),
4918            }),
4919            "AS" => Object::Name(b"Yes".to_vec()),
4920            "FT" => Object::Name(b"Btn".to_vec()),
4921        };
4922        let ap_id =
4923            resolve_widget_normal_appearance(&mut doc, &annot).expect("selected normal appearance");
4924        let stream = doc
4925            .get_object(ap_id)
4926            .expect("appearance stream")
4927            .as_stream()
4928            .expect("appearance stream");
4929        let content = String::from_utf8_lossy(&stream.content);
4930
4931        assert!(
4932            content.contains("YES"),
4933            "flatten should choose the selected normal appearance state"
4934        );
4935    }
4936
4937    #[test]
4938    fn widget_as_off_without_off_appearance_returns_none() {
4939        // When /AS is "Off" but the Normal appearance dict has no "Off" key,
4940        // the widget is deselected. Returning None avoids baking a checked
4941        // mark from the only remaining on-state appearance.
4942        let yes_stream = Object::Stream(Stream::new(
4943            dictionary! {
4944                "Type" => Object::Name(b"XObject".to_vec()),
4945                "Subtype" => Object::Name(b"Form".to_vec()),
4946                "BBox" => Object::Array(vec![
4947                    Object::Integer(0), Object::Integer(0),
4948                    Object::Integer(10), Object::Integer(10),
4949                ]),
4950            },
4951            b"q 5 5 m 5 5 l S Q\n".to_vec(),
4952        ));
4953
4954        let mut doc = Document::with_version("1.4");
4955        // Normal appearance has only a "0" key (checked state), no "Off" key.
4956        let state_id = doc.add_object(Object::Dictionary(dictionary! {
4957            "0" => yes_stream,
4958        }));
4959        let annot = dictionary! {
4960            "Subtype" => Object::Name(b"Widget".to_vec()),
4961            "Rect" => Object::Array(vec![
4962                Object::Integer(100), Object::Integer(700),
4963                Object::Integer(110), Object::Integer(710),
4964            ]),
4965            "AP" => Object::Dictionary(dictionary! {
4966                "N" => Object::Reference(state_id),
4967            }),
4968            "AS" => Object::Name(b"Off".to_vec()),
4969            "FT" => Object::Name(b"Btn".to_vec()),
4970        };
4971        assert!(
4972            resolve_widget_normal_appearance(&mut doc, &annot).is_none(),
4973            "Off state with no Off appearance should not resolve to the on-state stream"
4974        );
4975    }
4976
4977    #[test]
4978    fn bake_checkbox_radio_ap_marks_skips_off_widgets_without_off_normal_appearance() {
4979        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
4980            Vec::new(),
4981            Object::Dictionary(dictionary! {
4982                "1" => Object::Stream(Stream::new(
4983                    dictionary! {
4984                        "Type" => Object::Name(b"XObject".to_vec()),
4985                        "Subtype" => Object::Name(b"Form".to_vec()),
4986                        "BBox" => Object::Array(vec![
4987                            Object::Integer(0), Object::Integer(0),
4988                            Object::Integer(10), Object::Integer(10),
4989                        ]),
4990                        "Resources" => Object::Dictionary(dictionary! {}),
4991                    },
4992                    b"q 1 1 8 8 re W n 2 8 m 8 2 l 8 8 m 2 2 l s Q\n".to_vec(),
4993                )),
4994            }),
4995            dictionary! {
4996                "FT" => Object::Name(b"Btn".to_vec()),
4997                "AS" => Object::Name(b"Off".to_vec()),
4998                "T" => Object::string_literal("checkbox[0]"),
4999            },
5000        );
5001
5002        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5003        let page_id = doc.page_iter().next().expect("page");
5004        let baked = bake_checkbox_radio_ap_marks(&mut doc, page_id);
5005
5006        assert_eq!(baked, 0, "Off-state widget must not stamp the on-mark");
5007    }
5008
5009    #[test]
5010    fn adding_widget_xobject_preserves_indirect_inline_page_xobjects() {
5011        let mut doc = Document::with_version("1.4");
5012        let existing_xobject_id = doc.add_object(Object::Stream(Stream::new(
5013            dictionary! {
5014                "Type" => Object::Name(b"XObject".to_vec()),
5015                "Subtype" => Object::Name(b"Form".to_vec()),
5016                "BBox" => Object::Array(vec![
5017                    Object::Integer(0), Object::Integer(0),
5018                    Object::Integer(10), Object::Integer(10),
5019                ]),
5020            },
5021            b"q Q\n".to_vec(),
5022        )));
5023        let xobject_dict_id = doc.add_object(Object::Dictionary(dictionary! {
5024            "R11" => Object::Reference(existing_xobject_id),
5025        }));
5026
5027        let pages_id = doc.new_object_id();
5028        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5029            "Type" => Object::Name(b"Page".to_vec()),
5030            "Parent" => Object::Reference(pages_id),
5031            "MediaBox" => Object::Array(vec![
5032                Object::Integer(0), Object::Integer(0),
5033                Object::Integer(612), Object::Integer(792),
5034            ]),
5035            "Resources" => Object::Dictionary(dictionary! {
5036                "XObject" => Object::Reference(xobject_dict_id),
5037            }),
5038        }));
5039        doc.objects.insert(
5040            pages_id,
5041            Object::Dictionary(dictionary! {
5042                "Type"  => Object::Name(b"Pages".to_vec()),
5043                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5044                "Count" => Object::Integer(1)
5045            }),
5046        );
5047
5048        let new_xobject_id = doc.add_object(Object::Stream(Stream::new(
5049            dictionary! {
5050                "Type" => Object::Name(b"XObject".to_vec()),
5051                "Subtype" => Object::Name(b"Form".to_vec()),
5052                "BBox" => Object::Array(vec![
5053                    Object::Integer(0), Object::Integer(0),
5054                    Object::Integer(10), Object::Integer(10),
5055                ]),
5056            },
5057            b"0 0 10 10 re S\n".to_vec(),
5058        )));
5059
5060        add_xobject_to_page_resources(&mut doc, page_id, "XfaAp0", new_xobject_id);
5061
5062        let xobjects = doc
5063            .get_object(xobject_dict_id)
5064            .expect("xobject dict")
5065            .as_dict()
5066            .expect("xobject dict");
5067        assert!(
5068            xobjects.get(b"R11").is_ok(),
5069            "existing page XObject was lost"
5070        );
5071        assert!(
5072            xobjects.get(b"XfaAp0").is_ok(),
5073            "new flattened widget XObject was not added"
5074        );
5075    }
5076
5077    #[test]
5078    fn encrypted_pdf_without_xfa_returns_ok() {
5079        // Encrypted PDF without AcroForm/XFA → returned as-is (no XFA to flatten).
5080        let mut doc = Document::with_version("1.4");
5081        let pages_id = doc.new_object_id();
5082        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5083            "Type"     => Object::Name(b"Page".to_vec()),
5084            "Parent"   => Object::Reference(pages_id),
5085            "MediaBox" => Object::Array(vec![
5086                Object::Integer(0), Object::Integer(0),
5087                Object::Integer(612), Object::Integer(792),
5088            ]),
5089        }));
5090        doc.objects.insert(
5091            pages_id,
5092            Object::Dictionary(dictionary! {
5093                "Type"  => Object::Name(b"Pages".to_vec()),
5094                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5095                "Count" => Object::Integer(1),
5096            }),
5097        );
5098        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5099            "Type"  => Object::Name(b"Catalog".to_vec()),
5100            "Pages" => Object::Reference(pages_id),
5101        }));
5102        doc.trailer.set("Root", Object::Reference(catalog_id));
5103
5104        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5105            "Filter" => Object::Name(b"Standard".to_vec()),
5106            "V"      => Object::Integer(2),
5107            "Length"  => Object::Integer(128),
5108        }));
5109        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5110
5111        let mut buf = Vec::new();
5112        doc.save_to(&mut buf).expect("save test PDF");
5113
5114        let result = flatten_xfa_to_pdf(&buf);
5115        assert!(result.is_ok(), "non-XFA encrypted PDF should return Ok");
5116    }
5117
5118    #[test]
5119    fn encrypted_xfa_pdf_returns_encrypted_error() {
5120        // Encrypted PDF WITH AcroForm/XFA → should reach the decrypt check
5121        // and return Err(Encrypted) when the password is required.
5122        let mut doc = Document::with_version("1.4");
5123        let pages_id = doc.new_object_id();
5124        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5125            "Type"     => Object::Name(b"Page".to_vec()),
5126            "Parent"   => Object::Reference(pages_id),
5127            "MediaBox" => Object::Array(vec![
5128                Object::Integer(0), Object::Integer(0),
5129                Object::Integer(612), Object::Integer(792),
5130            ]),
5131        }));
5132        doc.objects.insert(
5133            pages_id,
5134            Object::Dictionary(dictionary! {
5135                "Type"  => Object::Name(b"Pages".to_vec()),
5136                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5137                "Count" => Object::Integer(1),
5138            }),
5139        );
5140        // Add AcroForm with XFA key so the byte-level pre-check passes.
5141        let xfa_stream_id = doc.add_object(Object::Stream(lopdf::Stream::new(
5142            dictionary! {},
5143            b"<xdp:xdp></xdp:xdp>".to_vec(),
5144        )));
5145        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
5146            "XFA" => Object::Reference(xfa_stream_id),
5147        }));
5148        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5149            "Type"     => Object::Name(b"Catalog".to_vec()),
5150            "Pages"    => Object::Reference(pages_id),
5151            "AcroForm" => Object::Reference(acroform_id),
5152        }));
5153        doc.trailer.set("Root", Object::Reference(catalog_id));
5154
5155        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5156            "Filter" => Object::Name(b"Standard".to_vec()),
5157            "V"      => Object::Integer(2),
5158            "Length"  => Object::Integer(128),
5159        }));
5160        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5161
5162        let mut buf = Vec::new();
5163        doc.save_to(&mut buf).expect("save encrypted PDF");
5164
5165        let result = flatten_xfa_to_pdf(&buf);
5166        assert!(result.is_err(), "expected Encrypted error");
5167        let err = result.unwrap_err();
5168        assert!(
5169            matches!(err, XfaError::Encrypted(_)),
5170            "expected XfaError::Encrypted, got: {err:?}"
5171        );
5172    }
5173
5174    #[test]
5175    fn owner_only_encrypted_pdf_is_handled_transparently() {
5176        // Owner-only encrypted PDFs (empty user password) are auto-decrypted by lopdf.
5177        // Verify that flatten_xfa_to_pdf processes them without error.
5178        let mut doc = Document::with_version("2.0");
5179        let pages_id = doc.new_object_id();
5180        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5181            "Type"     => Object::Name(b"Page".to_vec()),
5182            "Parent"   => Object::Reference(pages_id),
5183            "MediaBox" => Object::Array(vec![
5184                Object::Integer(0), Object::Integer(0),
5185                Object::Integer(612), Object::Integer(792),
5186            ]),
5187        }));
5188        doc.objects.insert(
5189            pages_id,
5190            Object::Dictionary(dictionary! {
5191                "Type"  => Object::Name(b"Pages".to_vec()),
5192                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5193                "Count" => Object::Integer(1),
5194            }),
5195        );
5196        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5197            "Type"  => Object::Name(b"Catalog".to_vec()),
5198            "Pages" => Object::Reference(pages_id),
5199        }));
5200        doc.trailer.set("Root", Object::Reference(catalog_id));
5201
5202        // Encrypt with owner password "secret", empty user password.
5203        let state = lopdf::aes256_encryption_state("secret", "", lopdf::Permissions::default())
5204            .expect("create encryption state");
5205        doc.encrypt(&state).expect("encrypt document");
5206
5207        let mut buf = Vec::new();
5208        doc.save_to(&mut buf).expect("save encrypted PDF");
5209
5210        // lopdf auto-decrypts owner-only encrypted PDFs, so is_pdf_encrypted returns false.
5211        assert!(
5212            !is_pdf_encrypted(&buf),
5213            "lopdf should auto-decrypt owner-only PDFs"
5214        );
5215
5216        // flatten_xfa_to_pdf should succeed — no XFA content, returns input as-is.
5217        let result = flatten_xfa_to_pdf(&buf);
5218        assert!(
5219            result.is_ok(),
5220            "owner-only encrypted PDF should be handled, got: {result:?}"
5221        );
5222    }
5223
5224    /// Build a minimal PDF with a Type0 (CID) font that has a /W array.
5225    fn build_pdf_with_cid_font(w_array: Vec<Object>, dw: Option<i64>) -> Document {
5226        let mut doc = Document::with_version("1.4");
5227
5228        // Minimal CIDFont descendant dictionary with /W
5229        let mut cid_dict = dictionary! {
5230            "Type"    => Object::Name(b"Font".to_vec()),
5231            "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
5232            "BaseFont" => Object::Name(b"TestFont".to_vec()),
5233            "W"       => Object::Array(w_array)
5234        };
5235        if let Some(dw_val) = dw {
5236            cid_dict.set("DW", Object::Integer(dw_val));
5237        }
5238        let cid_id = doc.add_object(Object::Dictionary(cid_dict));
5239
5240        // Type0 composite font pointing to the CIDFont
5241        let type0_dict = dictionary! {
5242            "Type"            => Object::Name(b"Font".to_vec()),
5243            "Subtype"         => Object::Name(b"Type0".to_vec()),
5244            "BaseFont"        => Object::Name(b"TestFont".to_vec()),
5245            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_id)])
5246        };
5247        doc.add_object(Object::Dictionary(type0_dict));
5248        doc
5249    }
5250
5251    /// Test CID /W array parsing: consecutive widths format.
5252    /// /W [120 [500 600 700]] → CID 120=500, CID 121=600, CID 122=700
5253    #[test]
5254    fn cid_w_array_consecutive() {
5255        let w = vec![
5256            Object::Integer(120),
5257            Object::Array(vec![
5258                Object::Integer(500),
5259                Object::Integer(600),
5260                Object::Integer(700),
5261            ]),
5262        ];
5263        let doc = build_pdf_with_cid_font(w, None);
5264        let _fonts = extract_embedded_fonts(&doc);
5265
5266        // No font stream embedded, so extract_embedded_fonts won't find data.
5267        // Test the parser directly via the Type0 dict.
5268        for obj in doc.objects.values() {
5269            let dict = match obj.as_dict() {
5270                Ok(d) => d,
5271                Err(_) => continue,
5272            };
5273            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5274            if subtype == Some(b"Type0".as_slice()) {
5275                let result = extract_cid_font_widths(&doc, dict);
5276                let (first, widths) = result.expect("should parse /W array");
5277                assert_eq!(first, 120);
5278                assert_eq!(widths.len(), 3);
5279                assert_eq!(widths[0], 500); // CID 120
5280                assert_eq!(widths[1], 600); // CID 121
5281                assert_eq!(widths[2], 700); // CID 122
5282                return;
5283            }
5284        }
5285        panic!("Type0 font not found in test document");
5286    }
5287
5288    /// Test CID /W array parsing: range format.
5289    /// /W [200 300 250] → CIDs 200-300 all have width 250
5290    #[test]
5291    fn cid_w_array_range() {
5292        let w = vec![
5293            Object::Integer(200),
5294            Object::Integer(300),
5295            Object::Integer(250),
5296        ];
5297        let doc = build_pdf_with_cid_font(w, None);
5298
5299        for obj in doc.objects.values() {
5300            let dict = match obj.as_dict() {
5301                Ok(d) => d,
5302                Err(_) => continue,
5303            };
5304            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5305            if subtype == Some(b"Type0".as_slice()) {
5306                let (first, widths) =
5307                    extract_cid_font_widths(&doc, dict).expect("should parse /W range");
5308                assert_eq!(first, 200);
5309                assert_eq!(widths.len(), 101); // 200..=300
5310                assert!(widths.iter().all(|&w| w == 250));
5311                return;
5312            }
5313        }
5314        panic!("Type0 font not found");
5315    }
5316
5317    /// Test CID /W array parsing: mixed consecutive + range formats.
5318    /// /W [120 [500 600 700] 200 300 250]
5319    /// CID 120=500, 121=600, 122=700, CIDs 200-300=250
5320    /// Default width (/DW) fills gaps (CIDs 123-199).
5321    #[test]
5322    fn cid_w_array_mixed() {
5323        let w = vec![
5324            Object::Integer(120),
5325            Object::Array(vec![
5326                Object::Integer(500),
5327                Object::Integer(600),
5328                Object::Integer(700),
5329            ]),
5330            Object::Integer(200),
5331            Object::Integer(300),
5332            Object::Integer(250),
5333        ];
5334        let doc = build_pdf_with_cid_font(w, Some(1000));
5335
5336        for obj in doc.objects.values() {
5337            let dict = match obj.as_dict() {
5338                Ok(d) => d,
5339                Err(_) => continue,
5340            };
5341            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5342            if subtype == Some(b"Type0".as_slice()) {
5343                let (first, widths) =
5344                    extract_cid_font_widths(&doc, dict).expect("should parse mixed /W");
5345                assert_eq!(first, 120);
5346                assert_eq!(widths.len(), 181); // 120..=300
5347                                               // Consecutive part
5348                assert_eq!(widths[0], 500); // CID 120
5349                assert_eq!(widths[1], 600); // CID 121
5350                assert_eq!(widths[2], 700); // CID 122
5351                                            // Gap filled with /DW=1000
5352                assert_eq!(widths[3], 1000); // CID 123
5353                assert_eq!(widths[79], 1000); // CID 199
5354                                              // Range part
5355                assert_eq!(widths[80], 250); // CID 200
5356                assert_eq!(widths[180], 250); // CID 300
5357                return;
5358            }
5359        }
5360        panic!("Type0 font not found");
5361    }
5362
5363    /// Test that /DW defaults to 1000 when not specified.
5364    #[test]
5365    fn cid_w_array_default_width() {
5366        let w = vec![
5367            Object::Integer(10),
5368            Object::Array(vec![Object::Integer(400)]),
5369            Object::Integer(20),
5370            Object::Array(vec![Object::Integer(600)]),
5371        ];
5372        let doc = build_pdf_with_cid_font(w, None); // no /DW → defaults to 1000
5373
5374        for obj in doc.objects.values() {
5375            let dict = match obj.as_dict() {
5376                Ok(d) => d,
5377                Err(_) => continue,
5378            };
5379            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5380            if subtype == Some(b"Type0".as_slice()) {
5381                let (first, widths) = extract_cid_font_widths(&doc, dict).expect("should parse /W");
5382                assert_eq!(first, 10);
5383                assert_eq!(widths[0], 400); // CID 10
5384                assert_eq!(widths[5], 1000); // CID 15 — default
5385                assert_eq!(widths[10], 600); // CID 20
5386                return;
5387            }
5388        }
5389        panic!("Type0 font not found");
5390    }
5391
5392    #[test]
5393    fn extract_embedded_fonts_keeps_simple_pdf_fonts_without_fontfile() {
5394        let mut doc = Document::new();
5395        let font_id = doc.add_object(Object::Dictionary(dictionary! {
5396            "Type" => Object::Name(b"Font".to_vec()),
5397            "Subtype" => Object::Name(b"Type1".to_vec()),
5398            "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
5399            "FirstChar" => Object::Integer(32),
5400            "LastChar" => Object::Integer(34),
5401            "Widths" => Object::Array(vec![
5402                Object::Integer(278),
5403                Object::Integer(333),
5404                Object::Integer(612),
5405            ]),
5406            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
5407        }));
5408
5409        let fonts = extract_embedded_fonts(&doc);
5410        let font = fonts
5411            .iter()
5412            .find(|font| font.name == "MyriadPro-Regular")
5413            .expect("expected reusable simple font");
5414
5415        assert!(font.data.is_empty(), "no FontFile* should keep data empty");
5416        assert_eq!(font.pdf_widths, Some((32, vec![278, 333, 612])));
5417        assert_eq!(
5418            font.pdf_source_font,
5419            Some(PdfSourceFont { object_id: font_id })
5420        );
5421    }
5422
5423    #[test]
5424    fn store_font_data_reserves_family_alias_for_regular_face() {
5425        let mut fonts = Vec::new();
5426        store_font_data(
5427            &mut fonts,
5428            "ArialMT",
5429            Vec::new(),
5430            Some((32, vec![278, 333, 611])),
5431            None,
5432            Some(PdfSourceFont { object_id: (1, 0) }),
5433        );
5434        store_font_data(
5435            &mut fonts,
5436            "Arial-BoldMT",
5437            Vec::new(),
5438            Some((32, vec![278, 333, 611])),
5439            None,
5440            Some(PdfSourceFont { object_id: (2, 0) }),
5441        );
5442        store_font_data(
5443            &mut fonts,
5444            "Arial-ItalicMT",
5445            Vec::new(),
5446            Some((32, vec![278, 333, 611])),
5447            None,
5448            Some(PdfSourceFont { object_id: (3, 0) }),
5449        );
5450
5451        let aliases: Vec<_> = fonts.iter().map(|font| font.name.as_str()).collect();
5452        assert!(aliases.contains(&"ArialMT"));
5453        assert!(aliases.contains(&"Arial-BoldMT"));
5454        assert!(aliases.contains(&"Arial-ItalicMT"));
5455        assert_eq!(
5456            aliases.iter().filter(|name| **name == "Arial").count(),
5457            1,
5458            "only the regular face should claim the bare family alias"
5459        );
5460    }
5461
5462    #[test]
5463    fn store_font_data_keeps_regular_ps_family_alias() {
5464        let mut fonts = Vec::new();
5465        store_font_data(
5466            &mut fonts,
5467            "MyriadPro-Regular",
5468            Vec::new(),
5469            Some((32, vec![278, 333, 612])),
5470            None,
5471            Some(PdfSourceFont { object_id: (4, 0) }),
5472        );
5473
5474        assert!(
5475            fonts.iter().any(|font| font.name == "Myriad Pro"),
5476            "regular PostScript names should still expose their family alias"
5477        );
5478    }
5479
5480    #[test]
5481    fn page_content_streams_resolves_indirect_contents_arrays() {
5482        let mut doc = Document::new();
5483        let stream_a = doc.add_object(Stream::new(
5484            dictionary! {"Length" => Object::Integer(8)},
5485            b"(A) Tj\n".to_vec(),
5486        ));
5487        let stream_b = doc.add_object(Stream::new(
5488            dictionary! {"Length" => Object::Integer(8)},
5489            b"(B) Tj\n".to_vec(),
5490        ));
5491        let contents_array = doc.add_object(Object::Array(vec![
5492            Object::Reference(stream_a),
5493            Object::Reference(stream_b),
5494        ]));
5495        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5496            "Type" => Object::Name(b"Page".to_vec()),
5497            "Contents" => Object::Reference(contents_array),
5498        }));
5499
5500        let streams = page_content_streams(&doc, page_id);
5501
5502        assert_eq!(
5503            streams.len(),
5504            2,
5505            "indirect /Contents arrays must be traversed"
5506        );
5507        assert!(streams[0].windows(2).any(|w| w == b"Tj"));
5508        assert!(streams[1].windows(2).any(|w| w == b"Tj"));
5509    }
5510
5511    #[test]
5512    fn embed_resolved_fonts_reuses_existing_pdf_font_object() {
5513        let mut doc = Document::new();
5514        let source_font_id = doc.add_object(Object::Dictionary(dictionary! {
5515            "Type" => Object::Name(b"Font".to_vec()),
5516            "Subtype" => Object::Name(b"Type1".to_vec()),
5517            "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
5518            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
5519        }));
5520        let before = doc.objects.len();
5521
5522        let mut resolved = HashMap::new();
5523        resolved.insert(
5524            "Myriad Pro_Normal_Normal".to_string(),
5525            ResolvedFont {
5526                name: "Myriad Pro".to_string(),
5527                data: Vec::new(),
5528                face_index: 0,
5529                units_per_em: 1000,
5530                ascender: 800,
5531                descender: -200,
5532                pdf_widths: Some((32, vec![278, 333, 612])),
5533                pdf_encoding: None,
5534                pdf_source_font: Some(PdfSourceFont {
5535                    object_id: source_font_id,
5536                }),
5537            },
5538        );
5539
5540        let empty_layout = LayoutDom { pages: vec![] };
5541        let (_font_map, font_objects, metrics_data) =
5542            embed_resolved_fonts(&mut doc, &resolved, &empty_layout);
5543
5544        assert_eq!(
5545            doc.objects.len(),
5546            before,
5547            "should not embed a new font object"
5548        );
5549        assert_eq!(font_objects.len(), 1);
5550        assert_eq!(font_objects[0].1, source_font_id);
5551        assert!(
5552            metrics_data["Myriad Pro_Normal_Normal"].font_data.is_none(),
5553            "reused simple fonts must keep WinAnsi text encoding"
5554        );
5555    }
5556
5557    #[test]
5558    fn strip_undefined_entities_preserves_raw_ampersands_in_processing_instructions() {
5559        let xml = r##"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><?renderCache.textRun 24 A. Adjustment & Location 0 1417 14917 0 0 0 "Myriad Pro" 0 0 18000 ISO-8859-1?><?renderCache.subset "Arial" 0 0 ISO-8859-1 "#$%&'()+,-./" ?><subform name="form1"><field name="A"/></subform></template>"##;
5560
5561        let stripped = strip_undefined_xml_entities(xml);
5562
5563        assert_eq!(
5564            stripped, xml,
5565            "raw ampersands inside processing instructions are valid and must survive sanitization"
5566        );
5567        roxmltree::Document::parse(&stripped)
5568            .expect("processing instructions must remain parseable");
5569    }
5570
5571    #[test]
5572    fn strip_undefined_entities_drops_only_true_named_entity_references() {
5573        let xml = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="form1"><draw name="D"><value><text>alpha &bogus; beta &#169; &amp; gamma</text></value></draw></subform></template>"#;
5574
5575        let stripped = strip_undefined_xml_entities(xml);
5576
5577        assert!(
5578            !stripped.contains("&bogus;"),
5579            "unknown named entities should still be removed for roxmltree compatibility"
5580        );
5581        assert!(stripped.contains("&#169;"));
5582        assert!(stripped.contains("&amp;"));
5583        roxmltree::Document::parse(&stripped).expect("sanitized XML should parse");
5584    }
5585
5586    /// Form DOM with more repeating instances than the template must expand
5587    /// the FormTree and populate field values.
5588    #[test]
5589    fn form_dom_expands_repeating_subform_instances() {
5590        use xfa_layout_engine::form::FormNodeType;
5591
5592        // Template: one Activity subform with bind=none, occur max=-1
5593        let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
5594          <subform name="root" layout="tb">
5595            <pageSet><pageArea name="P1">
5596              <contentArea w="200mm" h="280mm"/>
5597              <medium short="210mm" long="297mm"/>
5598            </pageArea></pageSet>
5599            <subform name="body" layout="tb">
5600              <subform name="Items" layout="tb">
5601                <bind match="none"/>
5602                <subform name="Row" layout="tb">
5603                  <bind match="none"/>
5604                  <occur max="-1"/>
5605                  <field name="Label"><ui><textEdit/></ui></field>
5606                </subform>
5607              </subform>
5608            </subform>
5609          </subform>
5610        </template>"#;
5611
5612        // Form DOM: 3 Row instances with values
5613        let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
5614          <subform name="root">
5615            <subform name="body">
5616              <subform name="Items">
5617                <instanceManager name="_Row"/>
5618                <subform name="Row">
5619                  <field name="Label"><value><text>Alpha</text></value></field>
5620                </subform>
5621                <subform name="Row">
5622                  <field name="Label"><value><text>Beta</text></value></field>
5623                </subform>
5624                <subform name="Row">
5625                  <field name="Label"><value><text>Gamma</text></value></field>
5626                </subform>
5627              </subform>
5628            </subform>
5629          </subform>
5630        </form>"#;
5631
5632        let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
5633        let merger = crate::merger::FormMerger::new(&data_dom);
5634        let (mut tree, root_id) = merger.merge(template).unwrap();
5635
5636        // Before form DOM: only 1 Row instance
5637        // Dump tree to understand structure
5638        fn find_by_name(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
5639            for &c in &tree.get(parent).children {
5640                if tree.get(c).name == name {
5641                    return Some(c);
5642                }
5643                if let Some(found) = find_by_name(tree, c, name) {
5644                    return Some(found);
5645                }
5646            }
5647            None
5648        }
5649        let items_id =
5650            find_by_name(&tree, root_id, "Items").expect("Items subform not found in tree");
5651        let rows_before = tree
5652            .get(items_id)
5653            .children
5654            .iter()
5655            .filter(|&&c| tree.get(c).name == "Row")
5656            .count();
5657        assert_eq!(
5658            rows_before, 1,
5659            "template merge should produce 1 Row (bind=none)"
5660        );
5661
5662        // Apply form DOM
5663        apply_form_dom_presence(&mut tree, root_id, form_xml);
5664
5665        // After form DOM: 3 Row instances with correct values
5666        let rows_after: Vec<FormNodeId> = tree
5667            .get(items_id)
5668            .children
5669            .iter()
5670            .filter(|&&c| tree.get(c).name == "Row")
5671            .copied()
5672            .collect();
5673        assert_eq!(
5674            rows_after.len(),
5675            3,
5676            "form DOM should expand to 3 Row instances"
5677        );
5678
5679        let values: Vec<String> = rows_after
5680            .iter()
5681            .map(|&row_id| {
5682                let label_id = tree.get(row_id).children[0];
5683                match &tree.get(label_id).node_type {
5684                    FormNodeType::Field { value } => value.clone(),
5685                    _ => String::new(),
5686                }
5687            })
5688            .collect();
5689        assert_eq!(values, vec!["Alpha", "Beta", "Gamma"]);
5690    }
5691
5692    // GL-QA36: verify the re-entrance guard prevents infinite recursion.
5693    //
5694    // We call flatten_xfa_to_pdf_simulate_reentrant (a #[cfg(test)] helper
5695    // that sets FLATTEN_DEPTH=1 before calling) to avoid accessing the
5696    // thread-local directly from the test sub-module.
5697    #[test]
5698    fn flatten_xfa_to_pdf_recursion_guard_returns_error() {
5699        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5700        let result = flatten_xfa_to_pdf_simulate_reentrant(&pdf_bytes);
5701        assert!(
5702            result.is_err(),
5703            "expected recursion guard to return Err, got Ok"
5704        );
5705        let err_msg = result.unwrap_err().to_string();
5706        assert!(
5707            err_msg.contains("recursively"),
5708            "expected error message to mention recursion, got: {err_msg}"
5709        );
5710    }
5711
5712    // GL-QA36: verify the depth counter is reset to 0 after a normal call so
5713    // subsequent calls on the same thread are not falsely blocked.
5714    #[test]
5715    fn flatten_xfa_to_pdf_depth_counter_resets_after_call() {
5716        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5717        // First call; should succeed and reset depth to 0.
5718        let _ = flatten_xfa_to_pdf(&pdf_bytes);
5719        // Second call must not be blocked by a leaked counter.
5720        let pdf_bytes2 = build_xfa_pdf(SIMPLE_XDP);
5721        let result = flatten_xfa_to_pdf(&pdf_bytes2);
5722        assert!(
5723            result.is_ok(),
5724            "second flatten call should succeed, got: {result:?}"
5725        );
5726    }
5727
5728    // XFA-F1-05 (issue #1088): flatten_xfa_to_pdf must never panic on empty input.
5729    // An empty byte slice is not a valid PDF, so the function should return an
5730    // error rather than panic or crash.
5731    #[test]
5732    fn flatten_xfa_to_pdf_does_not_panic_on_empty_input() {
5733        let result = flatten_xfa_to_pdf(&[]);
5734        // We only require it does not panic; an Err is perfectly acceptable.
5735        // (An Ok result would mean the PDF library accepts empty bytes, which
5736        //  would also be fine — the important invariant is no panic/abort.)
5737        let _ = result;
5738    }
5739
5740    // -----------------------------------------------------------------------
5741    // XFA-F6-01 (#1109): Pipeline contract — minimal well-formed XFA PDF
5742    // -----------------------------------------------------------------------
5743
5744    /// XFA-F6-01: the flatten pipeline completes without panicking on a
5745    /// minimal well-formed XFA PDF. This exercises all pipeline stages and
5746    /// verifies the debug_assert ordering constraints hold.
5747    #[test]
5748    fn flatten_pipeline_completes_on_minimal_xfa_pdf() {
5749        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5750        // The pipeline must not panic (debug_assert violations would panic in
5751        // debug builds). We do not require Ok — layout failure → static_fallback
5752        // is acceptable, the important invariant is no panic.
5753        let result = flatten_xfa_to_pdf(&pdf_bytes);
5754        let _ = result; // Ok or Err both acceptable; panic is not
5755    }
5756
5757    #[test]
5758    fn flatten_with_layout_dump_preserves_pdf_bytes() {
5759        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5760        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("plain flatten should succeed");
5761        let (flattened_with_dump, layout_dump) =
5762            flatten_xfa_to_pdf_with_layout_dump(&pdf_bytes).expect("dump flatten should succeed");
5763
5764        assert_eq!(flattened_with_dump, flattened);
5765        assert!(!layout_dump.pages.is_empty());
5766        assert_eq!(layout_dump.pages[0].page_num, 1);
5767        assert!(layout_dump.pages[0].used_height <= layout_dump.pages[0].page_height);
5768    }
5769
5770    // -----------------------------------------------------------------------
5771    // XFA-F6-02 (#1110): AcroForm/XFA removal tests
5772    // -----------------------------------------------------------------------
5773
5774    /// After flattening an XFA PDF, the output must not contain /NeedsRendering.
5775    #[test]
5776    fn flatten_removes_needs_rendering() {
5777        // Build a PDF with NeedsRendering in the catalog.
5778        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5779        // Insert NeedsRendering into the catalog via lopdf.
5780        {
5781            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for NeedsRendering test");
5782            let root_id = match doc.trailer.get(b"Root") {
5783                Ok(Object::Reference(id)) => *id,
5784                _ => panic!("no Root in trailer"),
5785            };
5786            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
5787                dict.set("NeedsRendering", Object::Boolean(true));
5788            }
5789            let mut out = Vec::new();
5790            doc.save_to(&mut out)
5791                .expect("re-save for NeedsRendering test");
5792            pdf_bytes = out;
5793        }
5794
5795        // Flatten should strip NeedsRendering.
5796        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5797        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
5798        let root_id = match doc.trailer.get(b"Root") {
5799            Ok(Object::Reference(id)) => *id,
5800            _ => panic!("no Root in flattened trailer"),
5801        };
5802        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
5803        assert!(
5804            catalog.get(b"NeedsRendering").is_err(),
5805            "/NeedsRendering must be absent after flatten"
5806        );
5807    }
5808
5809    /// After flattening an XFA PDF, the output must not contain /XFA anywhere
5810    /// in the catalog or AcroForm dictionary.
5811    #[test]
5812    fn flatten_removes_xfa_entry() {
5813        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5814        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5815
5816        // Search the serialised bytes for /XFA — the key must not appear.
5817        // We look for " /XFA" / "\n/XFA" patterns in the raw output.
5818        let flattened_str = String::from_utf8_lossy(&flattened);
5819        assert!(
5820            !flattened_str.contains("/XFA"),
5821            "/XFA must be absent from flattened output, but was found"
5822        );
5823    }
5824
5825    #[test]
5826    fn remove_acroform_purges_xfa_packet_objects() {
5827        let (mut doc, acroform_id, xfa_ids) = build_xfa_doc_with_xfa_array();
5828
5829        remove_acroform(&mut doc);
5830
5831        assert!(
5832            !doc.objects.contains_key(&acroform_id),
5833            "AcroForm object should be removed from doc.objects"
5834        );
5835        for xfa_id in &xfa_ids {
5836            assert!(
5837                !doc.objects.contains_key(xfa_id),
5838                "XFA packet object {xfa_id:?} should be removed from doc.objects"
5839            );
5840        }
5841
5842        let mut out = Vec::new();
5843        doc.save_to(&mut out).expect("save cleaned PDF");
5844        let out_str = String::from_utf8_lossy(&out);
5845        assert!(
5846            !out_str.contains("xdp:xdp"),
5847            "serialized output should not contain orphaned XFA packet payloads"
5848        );
5849        assert!(
5850            !out_str.contains("<template"),
5851            "serialized output should not contain orphaned template payloads"
5852        );
5853    }
5854
5855    /// After flattening, there must be no empty /Annots arrays in the output.
5856    #[test]
5857    fn flatten_removes_empty_annots_arrays() {
5858        // Build a PDF with an empty Annots array on the page.
5859        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5860        {
5861            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for annots test");
5862            let page_id = doc.page_iter().next().expect("at least one page");
5863            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
5864                dict.set("Annots", Object::Array(vec![]));
5865            }
5866            let mut out = Vec::new();
5867            doc.save_to(&mut out).expect("re-save for annots test");
5868            pdf_bytes = out;
5869        }
5870
5871        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5872        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
5873        for page_id in doc.page_iter() {
5874            let page = doc.get_dictionary(page_id).expect("page dict");
5875            match page.get(b"Annots") {
5876                Ok(Object::Array(arr)) => {
5877                    assert!(
5878                        !arr.is_empty(),
5879                        "page {:?}: /Annots must either be absent or non-empty after flatten",
5880                        page_id
5881                    );
5882                }
5883                _ => {} // absent = good
5884            }
5885        }
5886    }
5887
5888    #[test]
5889    fn remove_acroform_strips_widgets_from_indirect_annots_arrays() {
5890        let appearance = Object::Stream(Stream::new(
5891            dictionary! {
5892                "Type" => Object::Name(b"XObject".to_vec()),
5893                "Subtype" => Object::Name(b"Form".to_vec()),
5894                "BBox" => Object::Array(vec![
5895                    Object::Integer(0), Object::Integer(0),
5896                    Object::Integer(20), Object::Integer(20),
5897                ]),
5898                "Resources" => Object::Dictionary(dictionary! {}),
5899            },
5900            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
5901        ));
5902        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5903            Vec::new(),
5904            appearance,
5905            dictionary! {
5906                "FT" => Object::Name(b"Tx".to_vec()),
5907                "T" => Object::string_literal("field[0]"),
5908            },
5909        );
5910
5911        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5912        let page_id = doc.page_iter().next().expect("page");
5913        let annots = page_annotations(&doc, page_id);
5914        let annots_id = doc.add_object(Object::Array(annots));
5915        if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
5916            page_dict.set("Annots", Object::Reference(annots_id));
5917        }
5918
5919        remove_acroform(&mut doc);
5920
5921        let page = doc.get_dictionary(page_id).expect("page dict");
5922        assert!(
5923            page.get(b"Annots").is_err(),
5924            "widget-only indirect /Annots must be removed"
5925        );
5926    }
5927
5928    #[test]
5929    fn acroform_without_xfa_falls_back_to_static_cleanup() {
5930        let appearance = Object::Stream(Stream::new(
5931            dictionary! {
5932                "Type" => Object::Name(b"XObject".to_vec()),
5933                "Subtype" => Object::Name(b"Form".to_vec()),
5934                "BBox" => Object::Array(vec![
5935                    Object::Integer(0), Object::Integer(0),
5936                    Object::Integer(20), Object::Integer(20),
5937                ]),
5938                "Resources" => Object::Dictionary(dictionary! {}),
5939            },
5940            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
5941        ));
5942        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5943            Vec::new(),
5944            appearance,
5945            dictionary! {
5946                "FT" => Object::Name(b"Tx".to_vec()),
5947                "T" => Object::string_literal("field[0]"),
5948            },
5949        );
5950
5951        let mut doc = Document::load_mem(&pdf_bytes).expect("parse source PDF");
5952        let root_id = match doc.trailer.get(b"Root") {
5953            Ok(Object::Reference(id)) => *id,
5954            _ => panic!("no Root"),
5955        };
5956        let acroform_id = doc
5957            .get_dictionary(root_id)
5958            .expect("catalog")
5959            .get(b"AcroForm")
5960            .expect("AcroForm")
5961            .as_reference()
5962            .expect("AcroForm ref");
5963        if let Ok(Object::Dictionary(ref mut acroform)) = doc.get_object_mut(acroform_id) {
5964            acroform.remove(b"XFA");
5965        }
5966        let mut acroform_only = Vec::new();
5967        doc.save_to(&mut acroform_only)
5968            .expect("save AcroForm-only PDF");
5969
5970        let flattened = flatten_xfa_to_pdf(&acroform_only).expect("flatten failed");
5971        let flattened_doc = Document::load_mem(&flattened).expect("parse flattened PDF");
5972        let root_id = match flattened_doc.trailer.get(b"Root") {
5973            Ok(Object::Reference(id)) => *id,
5974            _ => panic!("no Root in flattened PDF"),
5975        };
5976        let catalog = flattened_doc
5977            .get_dictionary(root_id)
5978            .expect("flattened catalog");
5979        assert!(
5980            catalog.get(b"AcroForm").is_err(),
5981            "AcroForm-only PDFs should still be cleaned by flatten"
5982        );
5983
5984        let page_id = flattened_doc.page_iter().next().expect("flattened page");
5985        assert!(
5986            page_annotations(&flattened_doc, page_id).is_empty(),
5987            "flattened AcroForm-only PDFs should not retain widget annotations"
5988        );
5989    }
5990
5991    // -----------------------------------------------------------------------
5992    // XFA-F6-03 (#1111): validate_flattened_pdf tests
5993    // -----------------------------------------------------------------------
5994
5995    /// A clean (non-XFA) PDF must pass validation with no warnings.
5996    #[test]
5997    fn validate_flattened_pdf_clean_pdf_passes() {
5998        // Build the minimal PDF document (no AcroForm/XFA).
5999        let mut doc = Document::with_version("1.4");
6000        let pages_id = doc.new_object_id();
6001        let page_id = doc.add_object(Object::Dictionary(dictionary! {
6002            "Type"     => Object::Name(b"Page".to_vec()),
6003            "Parent"   => Object::Reference(pages_id),
6004            "MediaBox" => Object::Array(vec![
6005                Object::Integer(0), Object::Integer(0),
6006                Object::Integer(612), Object::Integer(792),
6007            ])
6008        }));
6009        doc.objects.insert(
6010            pages_id,
6011            Object::Dictionary(dictionary! {
6012                "Type"  => Object::Name(b"Pages".to_vec()),
6013                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
6014                "Count" => Object::Integer(1)
6015            }),
6016        );
6017        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
6018            "Type"  => Object::Name(b"Catalog".to_vec()),
6019            "Pages" => Object::Reference(pages_id)
6020        }));
6021        doc.trailer.set("Root", Object::Reference(catalog_id));
6022        let mut pdf_bytes = Vec::new();
6023        doc.save_to(&mut pdf_bytes).expect("save clean PDF");
6024
6025        let validation = validate_flattened_pdf(&pdf_bytes).expect("validate failed");
6026        assert!(
6027            validation.has_no_acroform,
6028            "clean PDF should have no AcroForm"
6029        );
6030        assert!(validation.has_no_xfa, "clean PDF should have no XFA");
6031        assert!(
6032            validation.has_no_needs_rendering,
6033            "clean PDF should have no NeedsRendering"
6034        );
6035        assert_eq!(validation.page_count, 1, "clean PDF should report 1 page");
6036        assert!(
6037            validation.warnings.is_empty(),
6038            "clean PDF should produce no warnings, got: {:?}",
6039            validation.warnings
6040        );
6041    }
6042
6043    /// validate_flattened_pdf must not panic on empty input.
6044    #[test]
6045    fn validate_flattened_pdf_does_not_panic_on_empty_input() {
6046        let result = validate_flattened_pdf(&[]);
6047        // Should return Ok with a warning, not panic.
6048        assert!(
6049            result.is_ok(),
6050            "expected Ok from empty input, got: {:?}",
6051            result.err()
6052        );
6053        let v = result.unwrap();
6054        assert_eq!(v.page_count, 0, "empty input has 0 pages");
6055        assert!(
6056            !v.warnings.is_empty(),
6057            "empty input should produce at least one warning"
6058        );
6059    }
6060
6061    // -----------------------------------------------------------------------
6062    // XFA-F6-04 (#1112): compare_flatten_quality tests
6063    // -----------------------------------------------------------------------
6064
6065    /// Page count comparison works correctly via compare_flatten_quality.
6066    #[test]
6067    fn compare_flatten_quality_page_count_comparison() {
6068        let original = build_xfa_pdf(SIMPLE_XDP);
6069        let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6070        let metrics =
6071            compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6072        // Both before and after must parse to at least 1 page.
6073        assert!(
6074            metrics.page_count_before >= 1,
6075            "original must have >= 1 page"
6076        );
6077        assert!(
6078            metrics.page_count_after >= 1,
6079            "flattened must have >= 1 page"
6080        );
6081        // page_count_match must reflect equality.
6082        assert_eq!(
6083            metrics.page_count_match,
6084            metrics.page_count_before == metrics.page_count_after,
6085            "page_count_match must equal page_count_before == page_count_after"
6086        );
6087    }
6088
6089    /// Content ratio is computed correctly.
6090    #[test]
6091    fn compare_flatten_quality_content_ratio_computed() {
6092        let original = build_xfa_pdf(SIMPLE_XDP);
6093        let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6094        let metrics =
6095            compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6096        // Ratio should be a finite positive number.
6097        assert!(
6098            metrics.content_ratio.is_finite() && metrics.content_ratio >= 0.0,
6099            "content_ratio must be finite and >= 0, got: {}",
6100            metrics.content_ratio
6101        );
6102        // Verify the ratio matches the raw values.
6103        let expected = if metrics.content_stream_bytes_before == 0 {
6104            1.0_f64
6105        } else {
6106            metrics.content_stream_bytes_after as f64 / metrics.content_stream_bytes_before as f64
6107        };
6108        assert!(
6109            (metrics.content_ratio - expected).abs() < 1e-9,
6110            "content_ratio mismatch: expected {expected}, got {}",
6111            metrics.content_ratio
6112        );
6113    }
6114
6115    // -----------------------------------------------------------------------
6116    // XFA-F7-02 (#1114): validate_text_completeness tests
6117    // -----------------------------------------------------------------------
6118
6119    /// validate_text_completeness returns completeness_ratio = 1.0 when the
6120    /// original XFA bytes have no datasets packet (nothing to check).
6121    #[test]
6122    fn validate_text_completeness_no_datasets_returns_perfect_ratio() {
6123        // Build an XFA PDF whose XDP has no <datasets> packet — just a template.
6124        let xdp = r#"<?xml version="1.0"?>
6125<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
6126  <template>
6127    <subform name="root">
6128      <field name="greeting"><ui><textEdit/></ui></field>
6129    </subform>
6130  </template>
6131</xdp:xdp>"#;
6132        let original = build_xfa_pdf(xdp);
6133        // Use a minimal clean PDF as the "flattened" output.
6134        let mut doc = Document::with_version("1.4");
6135        let pages_id = doc.new_object_id();
6136        let page_id = doc.add_object(Object::Dictionary(dictionary! {
6137            "Type"     => Object::Name(b"Page".to_vec()),
6138            "Parent"   => Object::Reference(pages_id),
6139            "MediaBox" => Object::Array(vec![
6140                Object::Integer(0), Object::Integer(0),
6141                Object::Integer(612), Object::Integer(792),
6142            ])
6143        }));
6144        doc.objects.insert(
6145            pages_id,
6146            Object::Dictionary(dictionary! {
6147                "Type"  => Object::Name(b"Pages".to_vec()),
6148                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
6149                "Count" => Object::Integer(1)
6150            }),
6151        );
6152        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
6153            "Type"  => Object::Name(b"Catalog".to_vec()),
6154            "Pages" => Object::Reference(pages_id)
6155        }));
6156        doc.trailer.set("Root", Object::Reference(catalog_id));
6157        let mut flattened = Vec::new();
6158        doc.save_to(&mut flattened).unwrap();
6159
6160        let result = validate_text_completeness(&original, &flattened)
6161            .expect("validate_text_completeness should not fail");
6162        assert!(
6163            result.expected_values.is_empty(),
6164            "no datasets packet means no expected values"
6165        );
6166        assert_eq!(
6167            result.completeness_ratio, 1.0,
6168            "empty expected set should yield ratio 1.0"
6169        );
6170    }
6171
6172    /// validate_text_completeness returns ratio 1.0 on empty inputs (no panic).
6173    #[test]
6174    fn validate_text_completeness_empty_inputs_do_not_panic() {
6175        let result = validate_text_completeness(&[], &[]);
6176        assert!(result.is_ok(), "should return Ok on empty inputs");
6177        let v = result.unwrap();
6178        assert_eq!(v.completeness_ratio, 1.0);
6179        assert!(v.expected_values.is_empty());
6180        assert!(v.missing_values.is_empty());
6181    }
6182
6183    // -----------------------------------------------------------------------
6184    // XFA-F9-03 (#1122): Debug logging — no panic/error on empty/non-XFA input
6185    // -----------------------------------------------------------------------
6186
6187    /// Calling `flatten_xfa_to_pdf` with completely empty input must not panic
6188    /// and must return an Ok (pass-through) or a well-formed Err.
6189    ///
6190    /// This also exercises the logging infrastructure: no log::error! calls
6191    /// should be emitted for inputs that simply have no XFA content.
6192    #[test]
6193    fn flatten_empty_bytes_does_not_panic_and_does_not_error() {
6194        // Empty byte slice: not a PDF, no XFA markers — should return Ok([])
6195        // or at worst a well-formed Err (not a panic).
6196        let result = flatten_xfa_to_pdf(b"");
6197        // We only assert it does not panic; Ok with empty bytes is acceptable.
6198        match result {
6199            Ok(_) => {}
6200            Err(_) => {} // Err is fine for invalid input
6201        }
6202    }
6203
6204    /// Non-XFA PDF bytes: flatten_xfa_to_pdf must return the input unchanged
6205    /// and must not emit any log errors.
6206    #[test]
6207    fn flatten_non_xfa_bytes_returns_input_unchanged() {
6208        // A trivial byte string that looks vaguely like PDF but has no /AcroForm
6209        // and no xdp:xdp — the pre-check at the start of flatten_xfa_to_pdf
6210        // should return immediately with the original bytes cloned.
6211        let input = b"%PDF-1.4\n%%EOF\n";
6212        let result = flatten_xfa_to_pdf(input);
6213        match result {
6214            Ok(out) => assert_eq!(out, input, "non-XFA input should pass through unchanged"),
6215            Err(_) => {} // Err is acceptable for degenerate input
6216        }
6217    }
6218}
pdf_xfa/flatten.rs

pdf_xfa/
flatten.rs