pdf_xfa/
flatten.rs

1//! # XFA Flattening Pipeline
2//!
//! This module parses the XFA template, runs layout, and writes PDF content streams.
4//!
5//! ## Pipeline Stages
6//!
7//! 1. **Extract** — `extract_embedded_fonts()` reads font programs and /Widths
8//!    arrays from PDF font dictionaries
9//! 2. **Store** — `store_font_data()` saves font bytes + widths keyed by name
10//! 3. **Resolve** — `XfaFontResolver::resolve()` matches XFA font names to
11//!    stored fonts, with fallbacks (alias, family, system)
12//! 4. **Inject** — `inject_resolved_metrics()` pushes resolved widths into
13//!    FontMetrics for the layout engine
14//! 5. **Layout** — `LayoutEngine::layout()` computes page positions using
15//!    resolved font metrics for accurate text measurement
16//! 6. **Render** — `generate_page_overlay()` in render_bridge converts LayoutDom
17//!    to PDF content stream operators
18//! 7. **Embed** — `embed_resolved_fonts()` writes font data into the PDF
19//!    and creates /Font resources
20//! 8. **Write** — The content streams are written back to PDF pages
21//!
22//! ## Static vs Dynamic Forms
23//!
24//! XFA Spec 3.3 §1.7 (p28-30):
25//! - **Static (XFAF)**: boilerplate in PDF, fields/subforms in XFA. Fixed layout.
26//! - **Dynamic (full XFA)**: all content in XFA. Layout computed at runtime.
27//! - `baseProfile="interactiveForms"` indicates static (XFAF) forms.
28//!
29//! XFA Spec 3.3 §2.9 (p72) — PDF-XFA Connection:
30//! - NeedsRendering flag: dynamic=true, XFAF=false.
31//! - XFA packets stored in AcroForm/XFA entry in catalog.
32//!
33//! ## /Widths Handling
34//!
35//! PDF /Widths arrays start at FirstChar (typically 32). For simple fonts we
36//! remap those code-indexed widths through the font encoding so the layout
37//! engine receives Unicode-indexed measurements.
38//!
39//! ## CID Font /W Arrays (PDF spec §9.7.4.3)
40//!
41//! CID fonts (Type0/composite) use `/W` arrays in the CIDFont descendant
42//! dictionary instead of simple `/Widths`. Two element types:
43//!   - `cid_start [w1 w2 ...]` — consecutive CIDs starting at cid_start
44//!   - `cid_first cid_last width` — range of CIDs with same width
45//!
46//! `/DW` (default width, defaults to 1000) covers CIDs not in `/W`.
47//!
48//! ## Known Limitations
49//!
50//! - CID-to-Unicode mapping (ToUnicode CMap) is not yet parsed
51//! - System font fallback may have different metrics than the PDF's embedded font
52
53use lopdf::{dictionary, Dictionary, Document, Object, ObjectId, Stream, StringFormat};
54use std::cell::Cell;
55use std::collections::{HashMap, HashSet};
56use std::fmt::Write as FmtWrite;
57#[cfg(not(target_arch = "wasm32"))]
58use std::thread;
59#[cfg(not(target_arch = "wasm32"))]
60use std::time::Duration;
61
thread_local! {
    /// GL-QA36: re-entrance guard for `flatten_xfa_to_pdf`.
    ///
    /// Failure mode being guarded against: the XFA layout fails,
    /// `static_fallback` hands back the original bytes unchanged (because
    /// lopdf cannot parse the file either), and a caller retries flatten on
    /// those same bytes. That re-enters the identical failure path, recursing
    /// without bound until the stack overflows.
    ///
    /// Protocol: the counter is incremented on entry to `flatten_xfa_to_pdf`
    /// and decremented by a drop guard on exit. Entering while the counter is
    /// already >= 1 returns an error immediately, which breaks the cycle.
    ///
    /// The counter is deliberately thread-local: the worker thread spawned
    /// inside `flatten_xfa_to_pdf` (`thread::spawn`) starts with its own
    /// counter at 0 and is therefore unaffected by the caller's guard.
    static FLATTEN_DEPTH: Cell<u32> = const { Cell::new(0) };
}
79
80#[cfg(feature = "xfa-js-sandboxed")]
81use crate::dynamic::apply_dynamic_scripts_with_runtime;
82use crate::dynamic::{
83    apply_dynamic_scripts, apply_dynamic_scripts_with_mode, runtime_diag_enabled,
84    DynamicScriptOutcome, FormDomMatchEntry, JsExecutionMode, OutputQuality,
85};
86use crate::error::{Result, XfaError};
87use crate::extract::extract_xfa_from_bytes;
88use crate::flatten_trace;
89use crate::font_bridge::{
90    font_variant_key, pdf_glyph_name_to_unicode, CidFontInfo, EmbeddedFontData, PdfBaseEncoding,
91    PdfSimpleEncoding, PdfSourceFont, ResolvedFont, XfaFontResolver, XfaFontSpec,
92};
93use crate::image_bridge::embed_image;
94use crate::javascript_policy::{self, JavaScriptEntryPoint};
95use crate::merger::FormMerger;
96use crate::render_bridge::{
97    generate_all_overlays, generate_field_values_overlays, unicode_to_winansi, FontMetricsData,
98    PageOverlay, XfaRenderConfig,
99};
100use xfa_dom_resolver::data_dom::DataDom;
101use xfa_layout_engine::form::{DrawContent, FormNodeId, FormNodeStyle, FormTree};
102use xfa_layout_engine::layout::{
103    LayoutContent, LayoutDom, LayoutEngine, LayoutNode, LayoutProfile,
104};
105
// ---------------------------------------------------------------------------
// XFA-F6-01 (#1109): Pipeline stage ordering contract.
//
// The XFA flatten pipeline runs its stages in one strict order:
//   Extract → Bind → Layout → Render → Embed → Write → Cleanup
//
// Stage boundaries carry `debug_assert!` checks that verify this order at
// runtime in debug builds. `PipelineStage` derives `Ord`, so those checks are
// plain comparisons.
// ---------------------------------------------------------------------------

/// Ordered stages of the XFA flatten process.
///
/// Variants are declared in execution order, so the derived `Ord` sorts an
/// earlier stage below a later one. Use `debug_assert!` at each stage boundary
/// to verify ordering in debug builds.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum PipelineStage {
    Extract = 0,
    Bind = 1,
    Layout = 2,
    Render = 3,
    Embed = 4,
    Write = 5,
    Cleanup = 6,
}
130
131fn create_minimal_pdf_document() -> Document {
132    let mut doc = Document::new();
133    let pages_id = doc.add_object(Object::Dictionary(dictionary! {
134        "Type" => Object::Name(b"Pages".to_vec()),
135        "Kids" => Object::Array(vec![]),
136        "Count" => Object::Integer(0)
137    }));
138    let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
139        "Type" => Object::Name(b"Catalog".to_vec()),
140        "Pages" => Object::Reference(pages_id)
141    }));
142    doc.trailer.set("Root", Object::Reference(catalog_id));
143    doc
144}
145
146/// Layout metadata emitted only for CLI diagnostics.
147#[derive(Debug, Clone, Default)]
148pub struct LayoutDump {
149    /// Per-page layout entries (one per rendered page).
150    pub pages: Vec<LayoutDumpEntry>,
151    /// Outcome of any dynamic script processing applied before layout.
152    pub dynamic_scripts: DynamicScriptOutcome,
153    /// Overall quality level of the flattened output.
154    pub output_quality: OutputQuality,
155}
156
/// A single page's record inside the optional [`LayoutDump`].
#[derive(Clone, Debug)]
pub struct LayoutDumpEntry {
    /// Page number, counted from 1.
    pub page_num: u32,
    /// Full height of the page area, in points.
    pub page_height: f64,
    /// Height taken up by laid-out content on this page, in points.
    pub used_height: f64,
    /// `true` when content did not fit and continued on the following page.
    pub overflow_to_next: bool,
    /// Name of the first element that caused the overflow, when there was one.
    pub first_overflow_element: Option<String>,
}
171
/// **D11/D12.** XFA flatten rendering policy: how PDFluent settles a conflict
/// between the form DOM embedded in a PDF (Adobe Reader's saved runtime state)
/// and a fresh `template + datasets` re-merge.
///
/// The decision record lives in
/// `benchmarks/runs/xfa_enterprise_plan/d10_formdom_vs_remerge_policy/`.
/// The default is [`XfaRenderingPolicy::SavedStateFaithful`].
///
/// **D12 validation (2026-05-21):** `FreshMergeExperimental` was measured on a
/// 9-doc target set with a GREEN verdict — no page-count regressions, and
/// `01de9ce4` recovered +80% text content. Corpus-scale measurement (D13) is
/// still pending, so `FreshMergeExperimental` stays explicitly experimental
/// and is not the production default.
///
/// PDFluent does **not** claim one universal Adobe-parity mode: XFA rendering
/// is policy-dependent (saved-state vs fresh-merge), mirroring Adobe's own
/// static-vs-dynamic rendering distinction.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum XfaRenderingPolicy {
    /// Honor the embedded form DOM: render the instance set the document was
    /// last saved with, and suppress template/data instances the form DOM did
    /// not enumerate. Default, fully supported, production behaviour.
    #[default]
    SavedStateFaithful,
    /// **Experimental.** Ignore the saved form DOM for dynamic sections and
    /// admit data-bound subforms the form DOM omitted. D12 validation
    /// (2026-05-21) confirmed an improvement on `01de9ce4` (+80% text, 20
    /// recovered rows) with no page-count regressions on 8 protected targets.
    /// Some targets (`13275420`, `b0389682`) do admit extra nodes under this
    /// policy, so page counts may change. Corpus-scale measurement (D13) is
    /// pending before production use. The default (`SavedStateFaithful`) is
    /// unchanged.
    FreshMergeExperimental,
}

impl XfaRenderingPolicy {
    /// Stable lowercase identifier for reports, traces and the CLI:
    /// `"saved_state_faithful"` or `"fresh_merge_experimental"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::FreshMergeExperimental => "fresh_merge_experimental",
            Self::SavedStateFaithful => "saved_state_faithful",
        }
    }

    /// Parse a CLI/API token (`"saved-state"` | `"fresh-merge"`, plus the
    /// [`as_str`](Self::as_str) spellings and their separator-free forms).
    ///
    /// Surrounding whitespace is ignored and the comparison is ASCII
    /// case-insensitive. Returns `None` for an unrecognised token.
    #[must_use]
    pub fn from_token(token: &str) -> Option<Self> {
        const SAVED_STATE_ALIASES: [&str; 4] = [
            "saved-state",
            "saved_state",
            "saved_state_faithful",
            "savedstatefaithful",
        ];
        const FRESH_MERGE_ALIASES: [&str; 4] = [
            "fresh-merge",
            "fresh_merge",
            "fresh_merge_experimental",
            "freshmergeexperimental",
        ];

        let candidate = token.trim();
        let is_one_of = |aliases: &[&str]| {
            aliases
                .iter()
                .any(|alias| candidate.eq_ignore_ascii_case(alias))
        };

        if is_one_of(&SAVED_STATE_ALIASES) {
            Some(Self::SavedStateFaithful)
        } else if is_one_of(&FRESH_MERGE_ALIASES) {
            Some(Self::FreshMergeExperimental)
        } else {
            None
        }
    }

    /// Whether this policy has an implementation. Always `true`: both policies
    /// are implemented and D12-validated (9-doc set, GREEN, 2026-05-21).
    /// `FreshMergeExperimental` remains experimental — corpus-scale
    /// measurement (D13) is pending before production use.
    #[must_use]
    pub const fn is_supported(self) -> bool {
        true
    }
}
241
242/// Lightweight metadata returned alongside the flattened PDF bytes.
243#[derive(Debug, Clone, PartialEq, Eq, Default)]
244pub struct FlattenMetadata {
245    /// Outcome of dynamic script processing applied during flattening.
246    pub dynamic_scripts: DynamicScriptOutcome,
247    /// Overall output quality level of the flattened result.
248    pub output_quality: OutputQuality,
249    /// **D11.** The rendering policy that produced this result.
250    pub rendering_policy: XfaRenderingPolicy,
251    /// **D12.** Count of `formdom_unmatched` nodes that were **admitted**
252    /// (not suppressed) under [`XfaRenderingPolicy::FreshMergeExperimental`].
253    /// These are data-bound, non-zero-instance, non-template-hidden subforms
254    /// that the form DOM omitted but the merger matched to data.
255    ///
256    /// Always 0 under `SavedStateFaithful`.  Under `FreshMergeExperimental`
257    /// a non-zero value means the output may differ from `SavedStateFaithful`.
258    ///
259    /// **D12-validated** (9-doc set, GREEN, 2026-05-21). On `01de9ce4` a value
260    /// of 20 confirmed 20 recovered purchase-order rows. Corpus-scale measurement
261    /// (D13) pending; treat non-zero values as informational until D13 completes.
262    pub fresh_merge_admitted_nodes: usize,
263}
264
265impl FlattenMetadata {
266    fn from_dynamic_scripts(dynamic_scripts: DynamicScriptOutcome) -> Self {
267        let output_quality = dynamic_scripts.output_quality;
268        Self {
269            dynamic_scripts,
270            output_quality,
271            rendering_policy: XfaRenderingPolicy::SavedStateFaithful,
272            fresh_merge_admitted_nodes: 0,
273        }
274    }
275}
276
277struct FlattenOutput {
278    pdf_bytes: Vec<u8>,
279    layout_dump: LayoutDump,
280    metadata: FlattenMetadata,
281}
282
283impl FlattenOutput {
284    fn new(
285        pdf_bytes: Vec<u8>,
286        mut layout_dump: LayoutDump,
287        dynamic_scripts: DynamicScriptOutcome,
288    ) -> Self {
289        let output_quality = dynamic_scripts.output_quality;
290        let metadata = FlattenMetadata::from_dynamic_scripts(dynamic_scripts.clone());
291        layout_dump.dynamic_scripts = dynamic_scripts;
292        layout_dump.output_quality = output_quality;
293        Self {
294            pdf_bytes,
295            layout_dump,
296            metadata,
297        }
298    }
299
300    fn without_dump(pdf_bytes: Vec<u8>) -> Self {
301        Self::new(
302            pdf_bytes,
303            LayoutDump::default(),
304            DynamicScriptOutcome::default(),
305        )
306    }
307}
308
309/// Returns `true` if the PDF bytes contain an `/Encrypt` entry in the trailer.
310pub fn is_pdf_encrypted(pdf_bytes: &[u8]) -> bool {
311    Document::load_mem(pdf_bytes)
312        .map(|doc| doc.trailer.get(b"Encrypt").is_ok())
313        .unwrap_or(false)
314}
315
316enum DecryptResult {
317    NotEncrypted,
318    Decrypted(Vec<u8>),
319    NeedsPassword,
320}
321
322/// Try to handle encryption: if not encrypted return as-is, if encrypted try
323/// empty password (owner-only encryption), otherwise report needs-password.
324fn try_decrypt_pdf(pdf_bytes: &[u8]) -> DecryptResult {
325    let mut doc = match Document::load_mem(pdf_bytes) {
326        Ok(d) => d,
327        Err(_) => return DecryptResult::NotEncrypted, // Can't parse — let downstream handle it
328    };
329
330    // lopdf auto-decrypts with empty password on load and removes /Encrypt.
331    // Use was_encrypted() to detect this — the original bytes are still encrypted
332    // and downstream parsers (pdf_syntax) can't read them.
333    if doc.was_encrypted() {
334        // Already decrypted by lopdf — save the decrypted document.
335        let mut buf = Vec::new();
336        match doc.save_to(&mut buf) {
337            Ok(()) => return DecryptResult::Decrypted(buf),
338            Err(_) => return DecryptResult::NeedsPassword,
339        }
340    }
341
342    if doc.trailer.get(b"Encrypt").is_ok() {
343        // /Encrypt present but lopdf couldn't auto-decrypt — try explicit empty password.
344        match Document::load_mem_with_password(pdf_bytes, "") {
345            Ok(mut decrypted_doc) => {
346                decrypted_doc.trailer.remove(b"Encrypt");
347                let mut buf = Vec::new();
348                match decrypted_doc.save_to(&mut buf) {
349                    Ok(()) => return DecryptResult::Decrypted(buf),
350                    Err(_) => return DecryptResult::NeedsPassword,
351                }
352            }
353            Err(_) => return DecryptResult::NeedsPassword,
354        }
355    }
356
357    DecryptResult::NotEncrypted
358}
359
360/// Returns `true` if the layout nodes contain at least one data-bearing field.
361///
362/// Checks the FormTree source node because the layout engine may emit
363/// `WrappedText` instead of `Field` for fields with content.
364///
365/// Non-data-bearing widgets are excluded so pages whose only interactive
366/// elements are decorative or structural are treated as static-only pages
367/// and are never suppressed by the page-drop heuristic:
368///
369/// * `Draw` elements are purely static content — text labels, images, lines.
370/// * `FieldKind::Signature` — a signature box carries no user-typed value.
371/// * `FieldKind::Button` — a push-button carries no data value by design.
372/// * `FieldKind::Barcode` — barcodes are presentation-only.
373fn page_has_fields(nodes: &[LayoutNode], tree: &FormTree) -> bool {
374    use xfa_layout_engine::form::{FieldKind, FormNodeType};
375    nodes.iter().any(|n| {
376        // Draw nodes (text labels, lines, images) are static content; they
377        // must never count as data fields for the page-suppression heuristic.
378        let is_data_field = matches!(tree.get(n.form_node).node_type, FormNodeType::Field { .. })
379            && !matches!(
380                tree.meta(n.form_node).field_kind,
381                FieldKind::Signature | FieldKind::Button | FieldKind::Barcode
382            );
383        is_data_field || page_has_fields(&n.children, tree)
384    })
385}
386
387/// Returns `true` if the layout nodes contain at least one field with a
388/// non-empty value.  Checks the FormTree source node because the layout
389/// engine converts non-empty field values to `WrappedText` for line-
390/// wrapping, making `LayoutContent::Field` unreliable for data detection.
391fn page_has_field_data(nodes: &[LayoutNode], tree: &FormTree) -> bool {
392    use xfa_layout_engine::form::FormNodeType;
393    nodes.iter().any(|n| {
394        matches!(
395            &tree.get(n.form_node).node_type,
396            FormNodeType::Field { value } if !value.is_empty()
397        ) || page_has_field_data(&n.children, tree)
398    })
399}
400
401/// Per-page field counts `(total, empty, nonempty)` for suppression tracing.
402fn page_field_counts(nodes: &[LayoutNode], tree: &FormTree) -> (usize, usize, usize) {
403    use xfa_layout_engine::form::FormNodeType;
404    let mut total = 0;
405    let mut empty = 0;
406    let mut nonempty = 0;
407    for n in nodes {
408        if let FormNodeType::Field { value } = &tree.get(n.form_node).node_type {
409            total += 1;
410            if value.trim().is_empty() {
411                empty += 1;
412            } else {
413                nonempty += 1;
414            }
415        }
416        let (t, e, ne) = page_field_counts(&n.children, tree);
417        total += t;
418        empty += e;
419        nonempty += ne;
420    }
421    (total, empty, nonempty)
422}
423
424/// Static (non-field) visible text characters on a page (draw text + loose/
425/// wrapped text not from a field value). Used only for suppression tracing.
426fn page_static_draw_chars(nodes: &[LayoutNode]) -> usize {
427    let mut total = 0usize;
428    for n in nodes {
429        match &n.content {
430            LayoutContent::Text(t) => total += t.chars().count(),
431            LayoutContent::Draw(DrawContent::Text(t)) => total += t.chars().count(),
432            LayoutContent::WrappedText {
433                lines, from_field, ..
434            } if !*from_field => {
435                total += lines.iter().map(|l| l.chars().count()).sum::<usize>();
436            }
437            _ => {}
438        }
439        total += page_static_draw_chars(&n.children);
440    }
441    total
442}
443
444/// Returns `true` when the page's layout nodes render any visible ink — a
445/// placed field (border/caption/value box), a static draw (line/rect/arc),
446/// non-empty draw/loose/wrapped text, or an image. A page whose nodes all
447/// carry `LayoutContent::None` or only whitespace renders nothing and is *not*
448/// visible.
449///
450/// This is the keep predicate for the [`suppression_trust_layout_enabled`]
451/// relaxation: it lets §4.3 keep data-empty pages that still draw content
452/// (trusting the layout page count) while still dropping truly-blank pages,
453/// rather than dropping every page whose fields lack a bound value.
454fn page_has_visible_content(nodes: &[LayoutNode]) -> bool {
455    nodes.iter().any(|n| {
456        let self_visible = match &n.content {
457            LayoutContent::None => false,
458            LayoutContent::Text(t) => !t.trim().is_empty(),
459            LayoutContent::WrappedText { lines, .. } => lines.iter().any(|l| !l.trim().is_empty()),
460            LayoutContent::Draw(DrawContent::Text(t)) => !t.trim().is_empty(),
461            // Lines, rectangles and arcs are visible structural ink.
462            LayoutContent::Draw(_) => true,
463            // A placed field renders its border/caption/value box even when
464            // the bound value is empty.
465            LayoutContent::Field { .. } => true,
466            LayoutContent::Image { .. } => true,
467        };
468        self_visible || page_has_visible_content(&n.children)
469    })
470}
471
/// Shared parser for the opt-in environment flags below.
///
/// Returns `true` only when the variable `name` is set to valid Unicode whose
/// trimmed value is `1`, or `on` / `true` in any ASCII case. An unset variable,
/// a non-Unicode value, or any other token yields `false`.
fn xfa_env_flag_is_truthy(name: &str) -> bool {
    std::env::var(name).map_or(false, |raw| {
        let token = raw.trim();
        token == "1" || token.eq_ignore_ascii_case("on") || token.eq_ignore_ascii_case("true")
    })
}

/// Opt-in (default-off) for `XFA_SUPPRESSION_TRUST_LAYOUT`: when set to a
/// truthy value (`1`/`on`/`true`), §4.3 suppression trusts the layout page
/// count and keeps every laid-out page that renders visible content
/// ([`page_has_visible_content`]) instead of dropping data-empty pages.
///
/// This is the suppression half of the over-pagination/occur-instance fix.
/// It is intentionally NOT default-on: until the layout engine stops
/// over-producing pages for a handful of docs, trusting the layout count
/// re-inflates those over-produced pages. The flag therefore stays off by
/// default and pairs with the layout over-production milestone.
fn suppression_trust_layout_enabled() -> bool {
    xfa_env_flag_is_truthy("XFA_SUPPRESSION_TRUST_LAYOUT")
}

/// BE-1 harvest-mode (default OFF), enabled by a truthy `XFA_JS_HARVEST_MODE`
/// (`1`/`on`/`true`). When set, the §4.3 `data_empty_dropped`
/// page-suppression decision is taken against the **pre-JS** (data-bound) field
/// values rather than the post-JS live tree. This preserves the static
/// data-empty suppression under `XFA_JS_EXECUTION_MODE=sandboxed`: the runtime
/// still applies structural intents (instanceManager / presence), but JS field
/// population (`#items` list writes, value mutations) no longer keeps an
/// otherwise data-empty page alive. Only meaningful when sandboxed JS actually
/// mutates the tree; in the static default path pre-JS == live, so this is a
/// no-op and the default binary stays byte-identical.
fn harvest_mode_enabled() -> bool {
    xfa_env_flag_is_truthy("XFA_JS_HARVEST_MODE")
}
504
505/// BE-1 harvest-mode: snapshot the ids of all `Field` nodes that carry a
506/// non-empty value at the moment of capture (taken pre-JS, right after the
507/// data merge). Used by [`page_has_field_data_snapshot`] so the suppression
508/// decision reflects data-bound emptiness, not JS-populated values.
509fn snapshot_nonempty_field_ids(tree: &FormTree) -> HashSet<FormNodeId> {
510    use xfa_layout_engine::form::FormNodeType;
511    let mut ids = HashSet::new();
512    for i in 0..tree.nodes.len() {
513        let id = FormNodeId(i);
514        if let FormNodeType::Field { value } = &tree.get(id).node_type {
515            if !value.trim().is_empty() {
516                ids.insert(id);
517            }
518        }
519    }
520    ids
521}
522
523/// BE-1 harvest-mode counterpart to [`page_has_field_data`]: a page "has field
524/// data" iff one of its nodes was a non-empty field in the pre-JS snapshot.
525fn page_has_field_data_snapshot(nodes: &[LayoutNode], snapshot: &HashSet<FormNodeId>) -> bool {
526    nodes.iter().any(|n| {
527        snapshot.contains(&n.form_node) || page_has_field_data_snapshot(&n.children, snapshot)
528    })
529}
530
531/// Sorted distinct FormNode ids referenced on a page — the "occur-instance
532/// signature". Two pages with identical signatures are repeated instances of
533/// the same template subtree (occur expansion reuses the template id).
534fn page_form_node_signature(nodes: &[LayoutNode], out: &mut Vec<usize>) {
535    for n in nodes {
536        out.push(n.form_node.0);
537        page_form_node_signature(&n.children, out);
538    }
539}
540
541/// Compute per-page suppression diagnostics mirroring the keep decision in the
542/// XFA §4.3 suppression block. Used only by the env-gated flatten trace.
543/// Build a child→parent index map over the FormTree (`parent[i]` = parent
544/// FormNodeId.0, or `usize::MAX` for roots). Used for occur-ancestor walks.
545fn build_parent_map(tree: &FormTree) -> Vec<usize> {
546    let mut parent = vec![usize::MAX; tree.nodes.len()];
547    for (pid, node) in tree.nodes.iter().enumerate() {
548        for &child in &node.children {
549            if child.0 < parent.len() {
550                parent[child.0] = pid;
551            }
552        }
553    }
554    parent
555}
556
557/// Count page nodes bound to a data node (`meta.bound_data_node.is_some()`).
558fn page_data_bound_count(nodes: &[LayoutNode], tree: &FormTree) -> usize {
559    let mut c = 0;
560    for n in nodes {
561        if tree.meta(n.form_node).bound_data_node.is_some() {
562            c += 1;
563        }
564        c += page_data_bound_count(&n.children, tree);
565    }
566    c
567}
568
569/// Nearest repeating-subform ancestor (`occur.is_repeating()`) of any form node
570/// on the page, walking up `parent_map`. Returns its FormNodeId.0, or None.
571fn page_repeating_ancestor(
572    distinct_ids: &[usize],
573    tree: &FormTree,
574    parent_map: &[usize],
575) -> Option<usize> {
576    use xfa_layout_engine::form::FormNodeId;
577    for &start in distinct_ids {
578        let mut cur = start;
579        let mut depth = 0;
580        while cur != usize::MAX && depth < 4096 {
581            if cur < tree.nodes.len() && tree.get(FormNodeId(cur)).occur.is_repeating() {
582                return Some(cur);
583            }
584            cur = parent_map.get(cur).copied().unwrap_or(usize::MAX);
585            depth += 1;
586        }
587    }
588    None
589}
590
591fn compute_suppression_diags(
592    layout: &LayoutDom,
593    tree: &FormTree,
594    pre_js_nonempty: Option<&HashSet<FormNodeId>>,
595) -> Vec<flatten_trace::PageSuppressionDiag> {
596    let parent_map = build_parent_map(tree);
597    let n = layout.pages.len();
598    let trust_layout = suppression_trust_layout_enabled();
599    // Raw per-page keep (matches the suppression `map`). Under harvest-mode the
600    // "has field data" test uses the pre-JS snapshot so the diag reflects the
601    // real (harvest) keep decision rather than the post-JS live tree.
602    let raw: Vec<(bool, bool, bool)> = layout
603        .pages
604        .iter()
605        .map(|p| {
606            let hd = match pre_js_nonempty {
607                Some(snap) => page_has_field_data_snapshot(&p.nodes, snap),
608                None => page_has_field_data(&p.nodes, tree),
609            };
610            (p.runtime_instantiated, page_has_fields(&p.nodes, tree), hd)
611        })
612        .collect();
613    let raw_keep = |i: usize| -> bool {
614        // Mirrors the suppression `map`: keep when runtime-instantiated, when a
615        // field carries data, or when the page has no fields at all.
616        // (`(hf && hd) || !hf` reduces to `hd || !hf`.)
617        let (rt, hf, hd) = raw[i];
618        rt || hd || !hf
619    };
620    let any_keep = (0..n).any(raw_keep);
621
622    // Signatures for repeated-instance detection.
623    let mut sigs: Vec<Vec<usize>> = Vec::with_capacity(n);
624    for p in &layout.pages {
625        let mut s = Vec::new();
626        page_form_node_signature(&p.nodes, &mut s);
627        s.sort_unstable();
628        s.dedup();
629        sigs.push(s);
630    }
631
632    let mut diags = Vec::with_capacity(n);
633    for i in 0..n {
634        let (rt, hf, hd) = raw[i];
635        let (fc, ef, nf) = page_field_counts(&layout.pages[i].nodes, tree);
636        let static_chars = page_static_draw_chars(&layout.pages[i].nodes);
637        let dup = (0..i)
638            .find(|&j| sigs[j] == sigs[i])
639            .map_or(-1, |j| j as i64);
640        let (keep, reason) = if n <= 1 {
641            (true, "single_page")
642        } else if rt {
643            (true, "runtime_instantiated")
644        } else if hf && hd {
645            (true, "has_field_data")
646        } else if !hf {
647            (true, "no_fields_static_kept")
648        } else if any_keep {
649            // Data-empty page. Default drops it; with XFA_SUPPRESSION_TRUST_LAYOUT
650            // on, keep it when it still renders visible content.
651            if trust_layout && page_has_visible_content(&layout.pages[i].nodes) {
652                (true, "trust_layout_kept")
653            } else {
654                (false, "data_empty_dropped")
655            }
656        } else {
657            (true, "all_empty_kept")
658        };
659
660        // --- Layout provenance ---
661        let data_bound = page_data_bound_count(&layout.pages[i].nodes, tree);
662        let repeating_ancestor = page_repeating_ancestor(&sigs[i], tree, &parent_map);
663        let under_repeating = repeating_ancestor.is_some();
664        let occur_template_id = repeating_ancestor.map_or(-1, |id| id as i64);
665        let has_data = nf > 0 || data_bound > 0;
666        let page_reason = if rt {
667            "root_page"
668        } else if under_repeating && !has_data {
669            "repeated_empty_instance"
670        } else if under_repeating {
671            "occur_instance"
672        } else if has_data {
673            "continuation"
674        } else if static_chars > 0 {
675            "static_page_area"
676        } else {
677            "unknown"
678        };
679        let suppression_safe_to_drop = page_reason == "repeated_empty_instance";
680        let provenance_confidence = if rt || under_repeating || has_data {
681            "exact"
682        } else if static_chars > 0 {
683            "inferred"
684        } else {
685            "unknown"
686        };
687
688        diags.push(flatten_trace::PageSuppressionDiag {
689            page_index: i,
690            keep,
691            reason,
692            field_count: fc,
693            empty_field_count: ef,
694            nonempty_field_count: nf,
695            static_draw_text_chars: static_chars,
696            distinct_form_nodes: sigs[i].len(),
697            duplicate_of_page: dup,
698            runtime_instantiated: rt,
699            under_repeating_subform: under_repeating,
700            occur_template_id,
701            data_bound_nodes_count: data_bound,
702            page_reason,
703            suppression_safe_to_drop,
704            provenance_confidence,
705        });
706    }
707    diags
708}
709
710/// Flatten all XFA content in `pdf_bytes` to static PDF content streams.
711///
712/// Returns the modified PDF bytes. The /AcroForm entry is removed so the
713/// result is a plain PDF/1.4 document.
714///
715/// If the PDF has no XFA content, returns a clone of the input unchanged.
716///
717/// # Performance Target
718///
719/// P95 latency ≤ 5 seconds for 50-page documents (see
720/// `docs/XFA_SUCCESS_CRITERIA.md`).  The pipeline uses a 30-second hard
721/// timeout per document; pathological inputs fall back to `static_fallback`.
722///
723/// # Debug Logging
724///
725/// Enable debug logging with `RUST_LOG=pdf_xfa=debug`.
726///
727/// # Oracle Comparison Approach (XFA-F1-04)
728///
729/// Reference ("oracle") output for quality comparison is generated using:
730///
731/// 1. **pdfRest** — `POST https://api.pdfrest.com/flatten-pdf`
732///    Uses Adobe's XFA engine.  Highest fidelity.  Rate-limited to ~1200
733///    calls/month across two accounts.  Keys at
734///    `~/.config/pdfluent/pdfrest-keys.json`.
735///
736///    ```bash
737///    # curl -X POST "https://api.pdfrest.com/flatten-pdf" \
738///    #   -H "Api-Key: <KEY>" \
739///    #   --form "input=@form.xfa.pdf;type=application/pdf" \
740///    #   -o reference.pdfrest.pdf
741///    ```
742///
743/// 2. **mutool** — `mutool convert -o reference.pdf input.xfa.pdf`
744///    Secondary oracle.  Free, offline, limited XFA support.
745///
746///    ```bash
747///    # mutool convert -o reference.mutool.pdf input.xfa.pdf
748///    ```
749///
750/// Quality is measured as per-page SSIM vs. the pdfRest oracle (target ≥ 0.95).
751/// See `scripts/generate_xfa_reference.sh` and `docs/XFA_SUCCESS_CRITERIA.md`.
752#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
753pub fn flatten_xfa_to_pdf(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
754    flatten_xfa_to_pdf_internal(pdf_bytes, false, XfaRenderingPolicy::SavedStateFaithful)
755        .map(|out| out.pdf_bytes)
756}
757/// Flatten XFA content and return the PDF bytes together with a per-page layout dump.
758///
759/// The [`LayoutDump`] is useful for CLI diagnostics and automated testing; use
760/// [`flatten_xfa_to_pdf`] when you only need the output bytes.
761///
762/// # Errors
763///
764/// Returns [`XfaError`] on parse, layout, or render failures.
765#[must_use = "flattened PDF bytes and layout dump must be used; discarding them loses output"]
766pub fn flatten_xfa_to_pdf_with_layout_dump(pdf_bytes: &[u8]) -> Result<(Vec<u8>, LayoutDump)> {
767    let out = flatten_xfa_to_pdf_internal(pdf_bytes, true, XfaRenderingPolicy::SavedStateFaithful)?;
768    Ok((out.pdf_bytes, out.layout_dump))
769}
770
771/// Flatten XFA content and return the PDF bytes together with [`FlattenMetadata`].
772///
773/// Metadata includes the dynamic-script outcome and overall output quality level.
774///
775/// # Errors
776///
777/// Returns [`XfaError`] on parse, layout, or render failures.
778#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
779pub fn flatten_xfa_to_pdf_with_metadata(pdf_bytes: &[u8]) -> Result<(Vec<u8>, FlattenMetadata)> {
780    let out =
781        flatten_xfa_to_pdf_internal(pdf_bytes, false, XfaRenderingPolicy::SavedStateFaithful)?;
782    Ok((out.pdf_bytes, out.metadata))
783}
784
785/// Flatten XFA content and return the PDF bytes, a layout dump, and metadata in one call.
786///
787/// Combines [`flatten_xfa_to_pdf_with_layout_dump`] and
788/// [`flatten_xfa_to_pdf_with_metadata`] without running the pipeline twice.
789///
790/// # Errors
791///
792/// Returns [`XfaError`] on parse, layout, or render failures.
793#[must_use = "flattened PDF bytes, layout dump, and metadata must be used; discarding them loses output"]
794pub fn flatten_xfa_to_pdf_with_layout_dump_and_metadata(
795    pdf_bytes: &[u8],
796) -> Result<(Vec<u8>, LayoutDump, FlattenMetadata)> {
797    let out = flatten_xfa_to_pdf_internal(pdf_bytes, true, XfaRenderingPolicy::SavedStateFaithful)?;
798    Ok((out.pdf_bytes, out.layout_dump, out.metadata))
799}
800
801/// **D11/D12.** Flatten XFA content under an explicit [`XfaRenderingPolicy`].
802///
803/// [`XfaRenderingPolicy::SavedStateFaithful`] (the default) behaves identically
804/// to [`flatten_xfa_to_pdf`]. [`XfaRenderingPolicy::FreshMergeExperimental`]
805/// is D12-validated (9-doc set, GREEN, 2026-05-21) but remains experimental —
806/// corpus-scale measurement (D13) is pending.
807///
808/// # Errors
809///
810/// Returns [`XfaError`] on parse, layout, or render failures.
811#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
812pub fn flatten_xfa_to_pdf_with_policy(
813    pdf_bytes: &[u8],
814    policy: XfaRenderingPolicy,
815) -> Result<Vec<u8>> {
816    flatten_xfa_to_pdf_with_policy_and_metadata(pdf_bytes, policy).map(|(bytes, _)| bytes)
817}
818
819/// **D11/D12.** Flatten XFA content under an explicit [`XfaRenderingPolicy`],
820/// returning the bytes and [`FlattenMetadata`] (whose `rendering_policy` field
821/// records the selected policy and `fresh_merge_admitted_nodes` the count of
822/// nodes admitted under `FreshMergeExperimental`).
823///
824/// # Errors
825///
826/// Returns [`XfaError`] on parse, layout, or render failures.
827#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
828pub fn flatten_xfa_to_pdf_with_policy_and_metadata(
829    pdf_bytes: &[u8],
830    policy: XfaRenderingPolicy,
831) -> Result<(Vec<u8>, FlattenMetadata)> {
832    // D12: FreshMergeExperimental is now plumbed through the pipeline.
833    let out = flatten_xfa_to_pdf_internal(pdf_bytes, false, policy)?;
834    let mut metadata = out.metadata;
835    metadata.rendering_policy = policy;
836    Ok((out.pdf_bytes, metadata))
837}
838
839fn flatten_xfa_to_pdf_internal(
840    pdf_bytes: &[u8],
841    collect_layout_dump: bool,
842    policy: XfaRenderingPolicy,
843) -> Result<FlattenOutput> {
844    // GL-QA36: Re-entrance guard.  If this function is entered while already
845    // running on this thread (depth ≥ 1), a recursive call has occurred —
846    // most likely a fallback path returning the original bytes which still
847    // contain /AcroForm + xdp:xdp markers.  Abort immediately with an error
848    // to prevent the infinite recursion / stack overflow.
849    //
850    // The worker thread spawned below has its own thread-local so its depth
851    // starts at 0 and is unaffected by this guard.
852    let depth = FLATTEN_DEPTH.with(|d| d.get());
853    if depth >= 1 {
854        return Err(XfaError::LayoutFailed(
855            "flatten_xfa_to_pdf called recursively — aborting to prevent stack overflow".into(),
856        ));
857    }
858    FLATTEN_DEPTH.with(|d| d.set(depth + 1));
859    // Drop guard: decrement the counter even if we return early.
860    struct DepthGuard;
861    impl Drop for DepthGuard {
862        fn drop(&mut self) {
863            FLATTEN_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
864        }
865    }
866    let _depth_guard = DepthGuard;
867
868    // 0a. Quick byte-level pre-check: if the raw bytes don't contain /AcroForm
869    //     (where XFA lives per the spec) and no XDP namespace, skip expensive
870    //     parsing. This prevents multi-second stalls on large non-XFA PDFs.
871    if !pdf_bytes.windows(9).any(|w| w == b"/AcroForm")
872        && !pdf_bytes.windows(7).any(|w| w == b"xdp:xdp")
873    {
874        return Ok(FlattenOutput::without_dump(pdf_bytes.to_vec()));
875    }
876
877    // 0b. Handle encrypted PDFs: try empty-password decrypt (owner-only encryption),
878    //     otherwise reject early — encrypted content produces garbage output.
879    let decrypted;
880    let pdf_bytes = match try_decrypt_pdf(pdf_bytes) {
881        DecryptResult::NotEncrypted => pdf_bytes,
882        DecryptResult::Decrypted(bytes) => {
883            decrypted = bytes;
884            &decrypted
885        }
886        DecryptResult::NeedsPassword => {
887            return Err(XfaError::Encrypted(
888                "PDF is encrypted and requires a password".into(),
889            ));
890        }
891    };
892
893    // 1. Extract XFA packets.
894    let packets = match extract_xfa_from_bytes(pdf_bytes.to_vec()) {
895        Ok(p) => p,
896        Err(_) => {
897            // No XFA packet was extracted, but the byte-level pre-check already
898            // established that the document carries /AcroForm or XFA markers.
899            // Fall back to static cleanup so AcroForm-only inputs still flatten.
900            return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
901        }
902    };
903
904    let template_xml = match packets.template() {
905        Some(t) => strip_undefined_xml_entities(t),
906        None => {
907            // XFA present but template packet missing/unparseable (truncated XML).
908            // Strip AcroForm + NeedsRendering so renderers use static content.
909            return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
910        }
911    };
912
913    // 1b. Detect corrupt/minimal XFA: tiny PDFs (<1KB) whose template has no
914    //     real content (no <subform> or <pageSet> children) produce blank output.
915    //     Fall back to static page copy so the original pages are preserved.
916    if is_corrupt_xfa_template(pdf_bytes.len(), &template_xml) {
917        return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
918    }
919
920    // 2. Try XFA template → layout → render pipeline.
921    //    If this fails (parse error, empty template, layout 0 pages, lopdf error),
922    //    fall back to preserving the existing page content with AcroForm stripped.
923    //
924    //    Wrap in a thread-based timeout (30s) to prevent hangs on pathological
925    //    XFA documents. If the timeout fires, the join handle's result is an Err
926    //    and we fall back to static_fallback.
927    let datasets_xml_owned = packets.datasets().map(strip_undefined_xml_entities);
928    let form_xml_owned = packets.get_packet("form").map(|s| s.to_string());
929
930    #[cfg(not(target_arch = "wasm32"))]
931    {
932        // Native: wrap in a thread-based timeout (30s) so pathological XFA
933        // documents cannot hang a server. If the timeout fires, the join
934        // handle's result is an Err and we fall back to static_fallback.
935        const FLATTEN_TIMEOUT: Duration = Duration::from_secs(30);
936        let pdf_bytes_ref = pdf_bytes.to_vec();
937        let template_xml_owned = template_xml.clone();
938
939        let handle = thread::spawn(move || {
940            xfa_flatten_inner(
941                &pdf_bytes_ref,
942                &template_xml_owned,
943                datasets_xml_owned.as_deref(),
944                form_xml_owned.as_deref(),
945                collect_layout_dump,
946                policy,
947            )
948        });
949
950        match handle.join() {
951            Ok(Ok(out)) => Ok(out),
952            Ok(Err(e @ XfaError::UnsupportedFeature(_))) => Err(e),
953            Ok(Err(e)) => {
954                eprintln!("XFA flatten failed: {e:?}");
955                static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
956            }
957            Err(_) => {
958                eprintln!("XFA flatten timed out after {:?}", FLATTEN_TIMEOUT);
959                static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
960            }
961        }
962    }
963
964    // wasm32 has no `std::thread`, no preemption, and the host page is the
965    // natural cancellation boundary, so run the same pipeline inline (no
966    // watchdog thread — `thread::spawn` is unsupported on wasm32 and panics).
967    #[cfg(target_arch = "wasm32")]
968    {
969        match xfa_flatten_inner(
970            pdf_bytes,
971            &template_xml,
972            datasets_xml_owned.as_deref(),
973            form_xml_owned.as_deref(),
974            collect_layout_dump,
975            policy,
976        ) {
977            Ok(out) => Ok(out),
978            Err(e @ XfaError::UnsupportedFeature(_)) => Err(e),
979            Err(e) => {
980                eprintln!("XFA flatten failed: {e:?}");
981                static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
982            }
983        }
984    }
985}
986
/// BE-1 tranche #1: gather the `name` attribute of every structural container
/// element (`subform`/`subformSet`/`exclGroup`/`area`) found anywhere in the
/// XFA template.
///
/// The resulting set is installed on the sandboxed runtime before script
/// execution so that a bare implicit SOM reference to a declared-but-absent
/// container resolves to a benign empty node (Adobe semantics) rather than
/// `undefined`.
///
/// Containers with a missing or empty `name` are skipped.  If `template_xml`
/// does not parse as XML, an empty set is returned.
///
/// Compiled only with the `xfa-js-sandboxed` feature; the default binary
/// never calls this.
#[cfg(feature = "xfa-js-sandboxed")]
fn collect_declared_container_names(template_xml: &str) -> std::collections::HashSet<String> {
    let parsed = match roxmltree::Document::parse(template_xml) {
        Ok(parsed) => parsed,
        // Unparseable template: nothing can be declared.
        Err(_) => return std::collections::HashSet::new(),
    };

    // Bind to a local so the iterator borrowing `parsed` is finished before
    // `parsed` itself goes out of scope.
    let declared: std::collections::HashSet<String> = parsed
        .descendants()
        .filter(|element| {
            element.is_element()
                && matches!(
                    element.tag_name().name(),
                    "subform" | "subformSet" | "exclGroup" | "area"
                )
        })
        .filter_map(|element| element.attribute("name"))
        .filter(|container_name| !container_name.is_empty())
        .map(|container_name| container_name.to_string())
        .collect();
    declared
}
1014
1015/// Core XFA flatten pipeline: parse template, bind data, layout, render.
1016fn xfa_flatten_inner(
1017    pdf_bytes: &[u8],
1018    template_xml: &str,
1019    datasets_xml: Option<&str>,
1020    form_xml: Option<&str>,
1021    collect_layout_dump: bool,
1022    policy: XfaRenderingPolicy,
1023) -> Result<FlattenOutput> {
1024    // XFA-F6-01 (#1109): pipeline stage tracker — verifies strict ordering via
1025    // debug_assert in each stage transition below.
1026    let mut _stage = PipelineStage::Extract;
1027
1028    // PIPELINE: stage 0 — Extract (parse datasets and image files from PDF)
1029    log::debug!(
1030        "XFA flatten: {} bytes input, template={} bytes",
1031        pdf_bytes.len(),
1032        template_xml.len()
1033    );
1034
1035    let data_dom = if let Some(ds_xml) = datasets_xml {
1036        DataDom::from_xml(ds_xml)
1037            .map_err(|e| XfaError::ParseFailed(format!("datasets parse: {e}")))?
1038    } else {
1039        DataDom::new()
1040    };
1041
1042    // Extract embedded image files from the PDF for resolving <image href="…">
1043    // references in the XFA template (XFA §2.3).
1044    let image_files = match Document::load_mem(pdf_bytes) {
1045        Ok(doc) => extract_embedded_images(&doc),
1046        Err(_) => HashMap::new(),
1047    };
1048
1049    // XFA-F9-02 (#1121): Graceful degradation — warn on unsupported features
1050    // instead of failing silently.  These checks run once per document after
1051    // template extraction so they add negligible overhead.
1052    if template_xml.contains("barcode") {
1053        log::warn!("XFA barcode elements found but not supported — rendered as empty boxes");
1054    }
1055    if template_xml.contains("<signature") || template_xml.contains("<Signature") {
1056        log::warn!("XFA signature elements found but not supported — elements skipped");
1057    }
1058    if javascript_policy::template_mentions_javascript(template_xml) {
1059        log::warn!(
1060            "{}",
1061            javascript_policy::execution_denied_message(JavaScriptEntryPoint::XfaEventHook)
1062        );
1063    }
1064
1065    // PIPELINE: stage 1 — Bind (merge template with data DOM)
1066    debug_assert!(
1067        _stage <= PipelineStage::Bind,
1068        "pipeline stage order violated: expected <= Bind"
1069    );
1070    _stage = PipelineStage::Bind;
1071
1072    // Trace capture (env-gated emit at end of function; counts are cheap).
1073    let trace_image_files = image_files.len();
1074    let merger = FormMerger::new(&data_dom).with_image_files(image_files);
1075    let (mut tree, root_id) = merger
1076        .merge(template_xml)
1077        .map_err(|e| XfaError::ParseFailed(format!("template merge: {e}")))?;
1078
1079    log::debug!("XFA bind: {} form nodes created", tree.nodes.len());
1080
1081    // M3-B Phase C validation hook (2026-05-03):
1082    // Allow operators (CLI, integration tests, cohort runs) to engage the
1083    // sandboxed JavaScript runtime by setting `XFA_JS_EXECUTION_MODE`.
1084    // - unset / "default" / "best_effort_static" → existing default
1085    //   (`BestEffortStatic`), no behaviour change for any existing user.
1086    // - "strict" → `Strict` (M8 `DENY_EXECUTION`).
1087    // - "sandboxed" / "sandboxed_runtime" → `SandboxedRuntime` (Phase B+C).
1088    //   Only effective when the `xfa-js-sandboxed` Cargo feature is compiled
1089    //   in; otherwise NullRuntime returns NotCompiledIn and the dispatch
1090    //   path falls back to the same skip behaviour as `BestEffortStatic`.
1091    //
1092    // BE-1 harvest-mode (default OFF): snapshot data-bound field emptiness
1093    // BEFORE the scripts run, so the later §4.3 page-suppression can preserve
1094    // the static `data_empty_dropped` behaviour even when sandboxed JS populates
1095    // fields. `None` (flag off) → suppression uses the live tree (byte-identical).
1096    let pre_js_nonempty_fields: Option<HashSet<FormNodeId>> = if harvest_mode_enabled() {
1097        Some(snapshot_nonempty_field_ids(&tree))
1098    } else {
1099        None
1100    };
1101    let dynamic_scripts = match std::env::var("XFA_JS_EXECUTION_MODE")
1102        .ok()
1103        .map(|s| s.to_ascii_lowercase())
1104        .as_deref()
1105    {
1106        Some("strict") => {
1107            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::Strict)?
1108        }
1109        Some("sandboxed") | Some("sandboxed_runtime") => {
1110            // Phase D-γ: create the runtime manually so we can call
1111            // `set_data_handle` before script execution, making the DataDom
1112            // accessible from `$record` and `xfa.resolveNodes("data.*")`.
1113            #[cfg(feature = "xfa-js-sandboxed")]
1114            {
1115                use crate::js_runtime::{NullRuntime, QuickJsRuntime, XfaJsRuntime};
1116                match QuickJsRuntime::new() {
1117                    Ok(mut rt) => {
1118                        rt.set_data_handle(&data_dom as *const _);
1119                        // BE-1 tranche #1: install template-declared container
1120                        // names so a bare implicit ref to a declared-but-absent
1121                        // subform resolves to a benign empty node (isNull=true)
1122                        // instead of `undefined`, letting guarded second-party
1123                        // scripts run their else branch (setInstances(0)/hide).
1124                        rt.set_declared_subform_names(collect_declared_container_names(
1125                            template_xml,
1126                        ));
1127                        apply_dynamic_scripts_with_runtime(
1128                            &mut tree,
1129                            root_id,
1130                            JsExecutionMode::SandboxedRuntime,
1131                            &mut rt,
1132                        )?
1133                    }
1134                    Err(_) => apply_dynamic_scripts_with_runtime(
1135                        &mut tree,
1136                        root_id,
1137                        JsExecutionMode::SandboxedRuntime,
1138                        &mut NullRuntime::new(),
1139                    )?,
1140                }
1141            }
1142            #[cfg(not(feature = "xfa-js-sandboxed"))]
1143            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::SandboxedRuntime)?
1144        }
1145        _ => apply_dynamic_scripts(&mut tree, root_id)?,
1146    };
1147    if dynamic_scripts.output_quality != OutputQuality::Exact {
1148        // M3-B Phase C (2026-05-03): appended host-binding counters after
1149        // the Phase B JS runtime counters. Defaults stay 0 in
1150        // `BestEffortStatic` mode so existing log parsers remain compatible.
1151        log::warn!(
1152            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={} js_unsupported_host_calls={} js_probe_skips={}",
1153            dynamic_scripts.output_quality.as_str(),
1154            dynamic_scripts.js_present,
1155            dynamic_scripts.js_skipped,
1156            dynamic_scripts.other_skipped,
1157            dynamic_scripts.formcalc_run,
1158            dynamic_scripts.formcalc_errors,
1159            dynamic_scripts.js_executed,
1160            dynamic_scripts.js_runtime_errors,
1161            dynamic_scripts.js_timeouts,
1162            dynamic_scripts.js_oom,
1163            dynamic_scripts.js_host_calls,
1164            dynamic_scripts.js_mutations,
1165            dynamic_scripts.js_instance_writes,
1166            dynamic_scripts.js_list_writes,
1167            dynamic_scripts.js_binding_errors,
1168            dynamic_scripts.js_resolve_failures,
1169            dynamic_scripts.js_data_reads,
1170            dynamic_scripts.js_unsupported_host_calls,
1171            dynamic_scripts.js_probe_skips,
1172        );
1173        eprintln!(
1174            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={} js_unsupported_host_calls={} js_probe_skips={}",
1175            dynamic_scripts.output_quality.as_str(),
1176            dynamic_scripts.js_present,
1177            dynamic_scripts.js_skipped,
1178            dynamic_scripts.other_skipped,
1179            dynamic_scripts.formcalc_run,
1180            dynamic_scripts.formcalc_errors,
1181            dynamic_scripts.js_executed,
1182            dynamic_scripts.js_runtime_errors,
1183            dynamic_scripts.js_timeouts,
1184            dynamic_scripts.js_oom,
1185            dynamic_scripts.js_host_calls,
1186            dynamic_scripts.js_mutations,
1187            dynamic_scripts.js_instance_writes,
1188            dynamic_scripts.js_list_writes,
1189            dynamic_scripts.js_binding_errors,
1190            dynamic_scripts.js_resolve_failures,
1191            dynamic_scripts.js_data_reads,
1192            dynamic_scripts.js_unsupported_host_calls,
1193            dynamic_scripts.js_probe_skips,
1194        );
1195    }
1196
1197    // XFA §3: when the PDF contains a pre-merged form DOM (saved by Adobe's
1198    // runtime after scripts executed), use its presence attributes to override
1199    // the template-based defaults. This captures script-driven visibility
1200    // changes (e.g. Avoka framework's sfcUtils.updateVisibility) that our
1201    // FormCalc interpreter cannot execute.
1202    // Graduated default-ON (static-parity-rc1): `SavedStateFaithful` admits
1203    // data-bound unmatched subforms through the same guarded branch as
1204    // `FreshMergeExperimental` instead of suppressing them, WITHOUT flipping the
1205    // policy default. Opt out with `XFA_FORMDOM_ADMIT_DATABOUND=0|off|false`.
1206    // Read here at the pipeline boundary (like `XFA_JS_EXECUTION_MODE`) so the
1207    // presence logic stays a pure, deterministically-testable function. No-op
1208    // under `FreshMergeExperimental` (which already admits the same set).
1209    let admit_databound_override = std::env::var("XFA_FORMDOM_ADMIT_DATABOUND")
1210        .map(|v| {
1211            let v = v.trim();
1212            !(v.is_empty()
1213                || v == "0"
1214                || v.eq_ignore_ascii_case("off")
1215                || v.eq_ignore_ascii_case("false"))
1216        })
1217        .unwrap_or(true);
1218    let (fresh_merge_admitted, form_dom_match_failures, form_dom_match_log) =
1219        if let Some(fxml) = form_xml {
1220            apply_form_dom_presence(&mut tree, root_id, fxml, policy, admit_databound_override)
1221        } else {
1222            (0, 0, Vec::new())
1223        };
1224
1225    // Resolve fonts BEFORE layout so the layout engine uses actual font metrics
1226    // (widths, ascender, descender) instead of generic AFM tables.
1227    let resolved_fonts = resolve_template_fonts(template_xml, pdf_bytes);
1228    inject_resolved_metrics(&mut tree, &resolved_fonts);
1229
1230    // PIPELINE: stage 2 — Layout (compute page positions using resolved font metrics)
1231    debug_assert!(
1232        _stage <= PipelineStage::Layout,
1233        "pipeline stage order violated: expected <= Layout"
1234    );
1235    _stage = PipelineStage::Layout;
1236
1237    let engine = LayoutEngine::new(&tree);
1238    let (mut layout, mut layout_dump) = if collect_layout_dump {
1239        let (layout, profile) = engine
1240            .layout_with_profile(root_id)
1241            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
1242        (layout, Some(layout_dump_from_profile(profile)))
1243    } else {
1244        let layout = engine
1245            .layout(root_id)
1246            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
1247        (layout, None)
1248    };
1249
1250    if layout.pages.is_empty() {
1251        return Err(XfaError::LayoutFailed("layout produced 0 pages".into()));
1252    }
1253
1254    log::debug!("XFA layout: {} pages produced", layout.pages.len());
1255
1256    // XFA Spec §4.3: suppress page subforms whose data is empty or absent.
1257    // A page with fields but no populated values is considered "data-empty"
1258    // and should be suppressed.  Pages without fields (static-only pages with
1259    // draws/images) are always kept.  At least one page is retained.
1260    //
1261    // Guard: when the form carries datasets data but our binding has not yet
1262    // propagated values into LayoutContent::Field.value for any page (e.g.
1263    // nested or namespace-qualified data paths), keep all layout pages.
1264    // Suppressing when binding is incomplete would incorrectly drop pages of
1265    // explicitly-paginated documents whose fields appear data-empty to this
1266    // heuristic even though real data is present.
1267    let trace_pages_produced = layout.pages.len();
1268    // Capture per-page suppression diagnostics BEFORE the retain mutates pages
1269    // (env-gated; empty when tracing is off).
1270    let trace_suppression = if flatten_trace::enabled() {
1271        compute_suppression_diags(&layout, &tree, pre_js_nonempty_fields.as_ref())
1272    } else {
1273        Vec::new()
1274    };
1275    // Default-off opt-in: trust the layout page count and keep every laid-out
1276    // page that still renders visible content, rather than dropping data-empty
1277    // pages. Pairs with the layout over-production fix (see helper docs).
1278    let trust_layout = suppression_trust_layout_enabled();
1279    if layout.pages.len() > 1 {
1280        let keep: Vec<bool> = layout
1281            .pages
1282            .iter()
1283            .map(|p| {
1284                // XFA 3.3 §3.1 / §8.6: pages emitted onto runtime-allocated
1285                // pageAreas (recorded in the form-DOM packet) are an explicit
1286                // commitment by Adobe's runtime — never drop them on a
1287                // data-empty heuristic.
1288                if p.runtime_instantiated {
1289                    true
1290                } else if page_has_fields(&p.nodes, &tree) {
1291                    // Default: keep only when a field carries data. With
1292                    // XFA_SUPPRESSION_TRUST_LAYOUT on, also keep data-empty
1293                    // pages that still render visible content (the `&&`
1294                    // short-circuits, so flag-off is byte-identical).
1295                    //
1296                    // BE-1 harvest-mode: when a pre-JS snapshot is present,
1297                    // base "has data" on the data-bound (pre-JS) field values
1298                    // so sandboxed JS field population can't keep an otherwise
1299                    // data-empty page. `None` (flag off) → live-tree check,
1300                    // byte-identical to the static default.
1301                    let has_field_data = match pre_js_nonempty_fields {
1302                        Some(ref snap) => page_has_field_data_snapshot(&p.nodes, snap),
1303                        None => page_has_field_data(&p.nodes, &tree),
1304                    };
1305                    has_field_data || (trust_layout && page_has_visible_content(&p.nodes))
1306                } else {
1307                    true
1308                }
1309            })
1310            .collect();
1311        let any_keep = keep.iter().any(|&k| k);
1312        if any_keep {
1313            let mut idx = 0;
1314            layout.pages.retain(|_| {
1315                let k = keep[idx];
1316                idx += 1;
1317                k
1318            });
1319            if let Some(ref mut dump) = layout_dump {
1320                let mut idx = 0;
1321                dump.pages.retain(|_| {
1322                    let k = keep[idx];
1323                    idx += 1;
1324                    k
1325                });
1326            }
1327        }
1328        // When NO page has data, keep all pages: the form is empty and
1329        // all structural pages should be preserved (e.g. a 6-page
1330        // inspection report with no filled-in values).
1331    }
1332
1333    if let Some(ref mut dump) = layout_dump {
1334        renumber_layout_dump_pages(dump);
1335    }
1336
1337    // PIPELINE: stage 3 — Render (generate XFA overlay content streams from layout)
1338    debug_assert!(
1339        _stage <= PipelineStage::Render,
1340        "pipeline stage order violated: expected <= Render"
1341    );
1342    _stage = PipelineStage::Render;
1343
1344    let mut doc = match Document::load_mem(pdf_bytes) {
1345        Ok(d) => d,
1346        Err(_) => {
1347            eprintln!("lopdf load failed, creating minimal PDF structure for XFA layout");
1348            create_minimal_pdf_document()
1349        }
1350    };
1351
1352    // PIPELINE: stage 4 — Embed (embed fonts/images into PDF document)
1353    debug_assert!(
1354        _stage <= PipelineStage::Embed,
1355        "pipeline stage order violated: expected <= Embed"
1356    );
1357    _stage = PipelineStage::Embed;
1358
1359    // PERF: embed_resolved_fonts is O(f * p) where f = unique resolved fonts
1360    // and p = PDF pages.  Each font requires a full font-program copy into the
1361    // PDF object stream plus /Widths array serialisation.  For documents with
1362    // many embedded fonts and many pages this is the dominant allocation source.
1363    // Potential optimisation: share font objects across pages (already done for
1364    // standard Type1 fonts F1-F3; extend to TrueType/CID fonts).
1365    let (font_map, embedded_font_objects, metrics_data) =
1366        embed_resolved_fonts(&mut doc, &resolved_fonts, &layout);
1367
1368    let config = XfaRenderConfig {
1369        font_map: std::sync::Arc::new(font_map),
1370        font_metrics_data: std::sync::Arc::new(metrics_data),
1371        ..Default::default()
1372    };
1373
1374    let overlays = generate_all_overlays(&layout, &config)
1375        .map_err(|e| XfaError::LayoutFailed(format!("overlay generation: {e:?}")))?;
1376
1377    log::debug!(
1378        "XFA render: {} content streams generated ({} bytes total)",
1379        overlays.len(),
1380        overlays
1381            .iter()
1382            .map(|o| o.content_stream.len())
1383            .sum::<usize>()
1384    );
1385
1386    // Register standard PDF fonts: F1=Times-Roman (serif), F2=Helvetica (sans), F3=Courier (mono).
1387    let font_ids: [ObjectId; 3] = [
1388        doc.add_object(Object::Dictionary(dictionary! {
1389            "Type"     => Object::Name(b"Font".to_vec()),
1390            "Subtype"  => Object::Name(b"Type1".to_vec()),
1391            "BaseFont" => Object::Name(b"Times-Roman".to_vec()),
1392            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1393        })),
1394        doc.add_object(Object::Dictionary(dictionary! {
1395            "Type"     => Object::Name(b"Font".to_vec()),
1396            "Subtype"  => Object::Name(b"Type1".to_vec()),
1397            "BaseFont" => Object::Name(b"Helvetica".to_vec()),
1398            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1399        })),
1400        doc.add_object(Object::Dictionary(dictionary! {
1401            "Type"     => Object::Name(b"Font".to_vec()),
1402            "Subtype"  => Object::Name(b"Type1".to_vec()),
1403            "BaseFont" => Object::Name(b"Courier".to_vec()),
1404            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1405        })),
1406    ];
1407
1408    let existing_page_ids: Vec<ObjectId> = doc.page_iter().collect();
1409    let n_layout = overlays.len();
1410    let n_existing = existing_page_ids.len();
1411
1412    // XFA Spec 3.3 §9.1 — Static vs Dynamic Forms: a form is static (XFAF)
1413    // when it uses only the restricted XFAF grammar subset (§7.6).  In
1414    // practice, Adobe identifies static forms by `baseProfile="interactiveForms"`
1415    // on the <template> element.  A dynamic form uses the full XFA grammar
1416    // and re-lays out content based on data/scripts.
1417    //
1418    // §7.6 enumerates grammar excluded from XFAF: area, occur (non-default),
1419    // multiple pageAreas, scripts that modify instance count, etc.
1420    //
1421    // Our detection uses baseProfile — this matches Adobe's behavior.  A more
1422    // rigorous check would inspect the template grammar for XFAF-excluded
1423    // elements, but baseProfile is the standard signal in real-world PDFs.
1424    let is_static_form = template_xml.contains("baseProfile=\"interactiveForms\"");
1425    let has_static_content = pages_have_static_content(&doc);
1426
1427    // Preserve pre-rendered PDF page content when:
1428    // 1. Explicit static form (baseProfile="interactiveForms"), OR
1429    // 2. Pages have substantial pre-rendered content AND the XFA layout
1430    //    produces at least as many pages as the original AND the XFA overlay
1431    //    has enough content to indicate a full page re-render.
1432    // 3. Layout engine produces fewer pages than the original — regardless
1433    //    of whether we detect static content in page streams. XFA PDFs often
1434    //    have form content in widget annotations rather than page content
1435    //    streams, so `has_static_content` may return false even when pages
1436    //    have substantial pre-rendered form content. When our layout is
1437    //    incomplete (fewer pages), preserving the original pages matches
1438    //    Adobe/pdfRest output better than truncated single-page output.
1439    //
1440    //    When the XFA overlay is minimal (e.g. just a title/header), the form
1441    //    relies on AcroForm widgets for its content. Preserving static content
1442    //    + baking widgets adds spurious form fields. Using the XFA path gives
1443    //    the correct minimal output matching pdfrest/Adobe behavior.
1444    //    When the XFA overlay is substantial (re-renders the full page), the
1445    //    pre-rendered content is authoritative — replacing it with XFA causes
1446    //    SSIM regressions due to font/rendering differences.
1447    //
1448    // The 1000-byte threshold separates minimal XFA templates (title/header
1449    // only, ~200-500 bytes) from full page re-renders (5000+ bytes).
1450    let overlay_is_substantial = overlays.iter().any(|o| o.content_stream.len() > 1000);
1451    // Clamp 1-page over-pagination only for static XFAF forms. Dynamic forms
1452    // often start from a 1-page placeholder PDF and legitimately flow onto
1453    // additional pages once XFA data is laid out. Clamping all 1-page inputs
1454    // to the original page count causes under-pagination on dynamic forms such
1455    // as Travel Expense Report / Checklist where Adobe renders 2-3 pages.
1456    //
1457    let preserve_static =
1458        is_static_form || n_layout < n_existing || has_static_content && overlay_is_substantial;
1459
1460    // PIPELINE: stage 5 — Write (write content streams to PDF pages)
1461    debug_assert!(
1462        _stage <= PipelineStage::Write,
1463        "pipeline stage order violated: expected <= Write"
1464    );
1465    _stage = PipelineStage::Write;
1466
1467    let mut trace_widgets_baked = 0usize;
1468    let mut trace_excess_deleted = 0usize;
1469    if preserve_static {
1470        let baked = flatten_widget_appearances(&mut doc);
1471        trace_widgets_baked = baked;
1472        if baked == 0 {
1473            // No widget APs were baked — the form structure lives in the
1474            // pre-rendered page content but field values exist only in the
1475            // XFA overlay.  Generate a lightweight overlay with just field
1476            // value text (no backgrounds/borders/captions) and append it
1477            // on top so field values become visible without visual artifacts.
1478            if let Ok(fv_overlays) = generate_field_values_overlays(&layout, &config) {
1479                for (i, overlay) in fv_overlays.iter().enumerate() {
1480                    if i < n_existing && !overlay.content_stream.is_empty() {
1481                        let _ = overlay_page_content(
1482                            &mut doc,
1483                            existing_page_ids[i],
1484                            overlay,
1485                            &font_ids,
1486                            &embedded_font_objects,
1487                        );
1488                    }
1489                }
1490            }
1491        }
1492        // When widgets WERE baked, their AP streams already contain field
1493        // content.  Overlaying XFA on top of baked widget appearances
1494        // causes ghost/double text because widget APs may contain rotation
1495        // matrices that produce differently-positioned text.
1496    } else {
1497        // Dynamic form: the layout engine determines page count.
1498        // Write each layout page to the output: overwrite existing pages
1499        // and add new pages when the layout produces more than the original.
1500        // NOTE: page cap (n_layout.min(n_existing)) was removed — it caused
1501        // 30 GATE #12 regressions because dynamic XFA forms often have a
1502        // single placeholder page while the actual form has many data-driven
1503        // pages. Capping to n_existing destroyed multi-page content.
1504        for (i, overlay) in overlays.iter().enumerate() {
1505            if i < n_existing {
1506                let lp = &layout.pages[i];
1507                write_page_content(
1508                    &mut doc,
1509                    existing_page_ids[i],
1510                    overlay,
1511                    &font_ids,
1512                    &embedded_font_objects,
1513                    Some(lp.width),
1514                    Some(lp.height),
1515                )?;
1516            } else {
1517                let lp = &layout.pages[i];
1518                add_new_page(
1519                    &mut doc,
1520                    lp.width,
1521                    lp.height,
1522                    overlay,
1523                    &font_ids,
1524                    &embedded_font_objects,
1525                )?;
1526            }
1527        }
1528
1529        // Bake checkbox/radio AP marks from AcroForm widgets onto existing
1530        // pages.  The XFA overlay draws borders and captions; the AP "on"
1531        // stream adds the filled mark (circle, checkmark, etc.) that the
1532        // oracle renders for hybrid forms (#886).
1533        for &page_id in &existing_page_ids[..n_existing.min(n_layout)] {
1534            bake_checkbox_radio_ap_marks(&mut doc, page_id);
1535        }
1536    }
1537
1538    // Remove excess pages when XFA layout produces fewer pages than the
1539    // original static content. This is the core fix for over-pagination
1540    // (#744): XFA PDFs often carry pre-rendered static pages that far exceed
1541    // the dynamic page count Adobe would produce.
1542    // But for static/hybrid forms (preserve_static), keep all original pages —
1543    // the static content lives in the PDF page streams, not in XFA draw
1544    // elements (#750).
1545    if n_layout < n_existing && !preserve_static {
1546        // delete_pages takes 1-indexed page numbers, highest first to avoid
1547        // index shifts.
1548        let excess: Vec<u32> = ((n_layout + 1) as u32..=(n_existing as u32))
1549            .rev()
1550            .collect();
1551        trace_excess_deleted = excess.len();
1552        doc.delete_pages(&excess);
1553    }
1554
1555    if is_static_form {
1556        // Static forms: strip Widget annotations but keep non-Widget (links,
1557        // stamps, etc.).  flatten_widget_appearances already baked widgets
1558        // with AP into the page content and removed them from Annots, but
1559        // widgets without AP may remain.  Remove those too so PDF viewers
1560        // don't render interactive fields over the baked content.
1561        for &page_id in &existing_page_ids {
1562            strip_widget_annotations(&mut doc, page_id);
1563        }
1564    } else {
1565        // Dynamic/hybrid forms: strip ALL annotations — pages were
1566        // overwritten by XFA layout or widget baking covered field values.
1567        for &page_id in existing_page_ids.iter().take(n_layout.min(n_existing)) {
1568            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
1569                dict.remove(b"Annots");
1570            }
1571        }
1572    }
1573
1574    // PIPELINE: stage 6 — Cleanup (remove AcroForm/XFA markers)
1575    debug_assert!(
1576        _stage <= PipelineStage::Cleanup,
1577        "pipeline stage order violated: expected <= Cleanup"
1578    );
1579    #[allow(unused_assignments)]
1580    {
1581        _stage = PipelineStage::Cleanup;
1582    }
1583
1584    remove_acroform(&mut doc);
1585    let stripped_js = javascript_policy::strip_javascript_for_flatten(&mut doc);
1586    if stripped_js > 0 {
1587        log::warn!("stripped {stripped_js} JavaScript action(s) from flattened output");
1588    }
1589
1590    // Env-gated flatten trace (default OFF): capture stage signals BEFORE the
1591    // document is consumed by serialization. Built only when XFA_FLATTEN_TRACE set.
1592    let trace_ctx = if flatten_trace::enabled() {
1593        let (acroform_removed, xfa_removed_structural, needs_rendering_removed) =
1594            catalog_cleanup_status(&doc);
1595        Some((
1596            acroform_removed,
1597            xfa_removed_structural,
1598            needs_rendering_removed,
1599            doc.page_iter().count(),
1600            layout
1601                .pages
1602                .iter()
1603                .filter(|p| p.runtime_instantiated)
1604                .count(),
1605        ))
1606    } else {
1607        None
1608    };
1609
1610    let mut out = Vec::new();
1611    doc.save_to(&mut out)
1612        .map_err(|e| XfaError::LayoutFailed(format!("save: {e}")))?;
1613
1614    if let Some((
1615        acroform_removed,
1616        xfa_removed_structural,
1617        needs_rendering_removed,
1618        output_page_count,
1619        runtime_pages,
1620    )) = trace_ctx
1621    {
1622        let js_mode =
1623            std::env::var("XFA_JS_EXECUTION_MODE").unwrap_or_else(|_| "best_effort_static".into());
1624        flatten_trace::emit(&flatten_trace::TraceInputs {
1625            suppression: &trace_suppression,
1626            input_bytes: pdf_bytes.len(),
1627            template_bytes: template_xml.len(),
1628            js_execution_mode: &js_mode,
1629            flatten_path: if preserve_static {
1630                "static_preserve"
1631            } else {
1632                "dynamic"
1633            },
1634            template_packet_found: true,
1635            datasets_packet_found: datasets_xml.is_some(),
1636            form_packet_found: form_xml.is_some(),
1637            image_files: trace_image_files,
1638            tree: &tree,
1639            scripts: &dynamic_scripts,
1640            layout: &layout,
1641            pages_produced: trace_pages_produced,
1642            pages_after_suppression: layout.pages.len(),
1643            runtime_instantiated_pages: runtime_pages,
1644            overlays: &overlays,
1645            n_layout,
1646            n_existing,
1647            is_static_form,
1648            has_static_content,
1649            preserve_static,
1650            excess_pages_deleted: trace_excess_deleted,
1651            widgets_baked: trace_widgets_baked,
1652            acroform_removed,
1653            xfa_removed_structural,
1654            needs_rendering_removed,
1655            javascript_actions_stripped: stripped_js,
1656            output_bytes: out.len(),
1657            output_page_count,
1658        });
1659    }
1660
1661    // Epic A E-5: patch match-failure data into the dynamic_scripts outcome.
1662    let mut dynamic_scripts = dynamic_scripts;
1663    dynamic_scripts.form_dom_match_failures = form_dom_match_failures;
1664    dynamic_scripts.form_dom_match_log = form_dom_match_log;
1665
1666    let mut flatten_out = FlattenOutput::new(out, layout_dump.unwrap_or_default(), dynamic_scripts);
1667    flatten_out.metadata.fresh_merge_admitted_nodes = fresh_merge_admitted;
1668    Ok(flatten_out)
1669}
1670
1671/// Inspect the catalog after cleanup: returns
1672/// `(acroform_removed, xfa_removed_structural, needs_rendering_removed)`.
1673/// Used by the env-gated flatten trace to confirm structural XFA removal.
1674fn catalog_cleanup_status(doc: &Document) -> (bool, bool, bool) {
1675    let root_id = match doc.trailer.get(b"Root") {
1676        Ok(Object::Reference(id)) => *id,
1677        _ => return (true, true, true),
1678    };
1679    let Ok(cat) = doc.get_dictionary(root_id) else {
1680        return (true, true, true);
1681    };
1682    let acroform_present = cat.get(b"AcroForm").is_ok();
1683    let needs_rendering_present = cat.get(b"NeedsRendering").is_ok();
1684    let direct_xfa = cat.get(b"XFA").is_ok();
1685    let acroform_xfa = cat
1686        .get(b"AcroForm")
1687        .ok()
1688        .and_then(|o| match o {
1689            Object::Reference(id) => doc.get_dictionary(*id).ok(),
1690            Object::Dictionary(d) => Some(d),
1691            _ => None,
1692        })
1693        .map(|d| d.get(b"XFA").is_ok())
1694        .unwrap_or(false);
1695    (
1696        !acroform_present,
1697        !(direct_xfa || acroform_xfa),
1698        !needs_rendering_present,
1699    )
1700}
1701
1702fn layout_dump_from_profile(profile: LayoutProfile) -> LayoutDump {
1703    LayoutDump {
1704        pages: profile
1705            .pages
1706            .into_iter()
1707            .enumerate()
1708            .map(|(idx, page)| LayoutDumpEntry {
1709                page_num: idx as u32 + 1,
1710                page_height: page.page_height,
1711                used_height: page.used_height,
1712                overflow_to_next: page.overflow_to_next,
1713                first_overflow_element: page.first_overflow_element,
1714            })
1715            .collect(),
1716        ..Default::default()
1717    }
1718}
1719
1720fn renumber_layout_dump_pages(dump: &mut LayoutDump) {
1721    for (idx, page) in dump.pages.iter_mut().enumerate() {
1722        page.page_num = idx as u32 + 1;
1723    }
1724}
1725
1726// ---------------------------------------------------------------------------
1727// Embedded image files extraction (XFA §2.3 href resolution)
1728// ---------------------------------------------------------------------------
1729
1730/// Extract embedded files from the PDF's Names/EmbeddedFiles tree.
1731///
1732/// XFA `<image href=".\filename.jpg">` references are resolved against this
1733/// tree at merge time (XFA Spec 3.3 §2.3).  The returned map is keyed by
1734/// the filename as it appears in the Names array (e.g. `.\lintje.jpg`).
1735fn extract_embedded_images(doc: &Document) -> HashMap<String, Vec<u8>> {
1736    let mut images = HashMap::new();
1737
1738    // Helper: resolve a potentially indirect object.
1739    fn deref_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
1740        match obj {
1741            Object::Reference(id) => doc.get_dictionary(*id).ok(),
1742            Object::Dictionary(d) => Some(d),
1743            _ => None,
1744        }
1745    }
1746
1747    // Helper: extract stream content (decompressed).
1748    fn extract_stream(doc: &Document, obj: &Object) -> Option<Vec<u8>> {
1749        let stream_obj = match obj {
1750            Object::Reference(id) => doc.get_object(*id).ok()?,
1751            other => other,
1752        };
1753        if let Object::Stream(ref stream) = *stream_obj {
1754            let mut s = stream.clone();
1755            let _ = s.decompress();
1756            Some(s.content.clone())
1757        } else {
1758            None
1759        }
1760    }
1761
1762    // Traverse: Catalog → /Names → /EmbeddedFiles → /Names array
1763    let catalog = match doc.catalog() {
1764        Ok(c) => c,
1765        Err(_) => return images,
1766    };
1767    let names_obj = match catalog.get(b"Names") {
1768        Ok(obj) => obj,
1769        Err(_) => {
1770            eprintln!("[img-href] no /Names in catalog");
1771            return images;
1772        }
1773    };
1774    let names_dict = match deref_dict(doc, names_obj) {
1775        Some(d) => d,
1776        None => return images,
1777    };
1778    // XFA PDFs may use /XFAImages instead of /EmbeddedFiles for image
1779    // references.  Check both keys.
1780    let ef_obj = match names_dict
1781        .get(b"XFAImages")
1782        .or_else(|_| names_dict.get(b"EmbeddedFiles"))
1783    {
1784        Ok(obj) => obj,
1785        Err(_) => return images,
1786    };
1787    let ef_dict = match deref_dict(doc, ef_obj) {
1788        Some(d) => d,
1789        None => return images,
1790    };
1791
1792    // The EmbeddedFiles name tree has a /Names array: [(name1, ref1), …]
1793    let names_arr_obj = match ef_dict.get(b"Names") {
1794        Ok(obj) => obj,
1795        Err(_) => return images,
1796    };
1797    let names_array = match names_arr_obj {
1798        Object::Array(arr) => arr,
1799        Object::Reference(id) => match doc.get_object(*id) {
1800            Ok(Object::Array(arr)) => arr,
1801            _ => return images,
1802        },
1803        _ => return images,
1804    };
1805
1806    // Process pairs: (name_string, value_ref)
1807    let mut i = 0;
1808    while i + 1 < names_array.len() {
1809        let name = match &names_array[i] {
1810            Object::String(bytes, _) => String::from_utf8_lossy(bytes).to_string(),
1811            _ => {
1812                i += 2;
1813                continue;
1814            }
1815        };
1816
1817        // The value can be:
1818        //   1. A FileSpec dict: /EF → /F → stream
1819        //   2. Directly a stream (non-standard but seen in XFA PDFs)
1820        let value_ref = &names_array[i + 1];
1821
1822        // Try path 1: FileSpec dict
1823        if let Some(filespec) = deref_dict(doc, value_ref) {
1824            if let Ok(ef_obj) = filespec.get(b"EF") {
1825                if let Some(ef) = deref_dict(doc, ef_obj) {
1826                    if let Ok(f_ref) = ef.get(b"F") {
1827                        if let Some(data) = extract_stream(doc, f_ref) {
1828                            images.insert(name.clone(), data);
1829                            i += 2;
1830                            continue;
1831                        }
1832                    }
1833                }
1834            }
1835        }
1836
1837        // Try path 2: Direct stream reference
1838        if let Some(data) = extract_stream(doc, value_ref) {
1839            images.insert(name.clone(), data);
1840        }
1841
1842        i += 2;
1843    }
1844    images
1845}
1846
1847// ---------------------------------------------------------------------------
1848// Font extraction, resolution, and embedding
1849// ---------------------------------------------------------------------------
1850
1851/// Extract embedded font programs from a lopdf `Document`, including `/Widths`
1852/// arrays and encoding metadata.
1853///
1854/// This is the flatten-pipeline-internal variant. It differs from the public
1855/// `extract::extract_embedded_fonts` in three ways:
1856/// - Input type: `lopdf::Document` (lopdf object model) vs `pdf_syntax::Pdf`.
1857/// - Return type: [`EmbeddedFontData`] structs (with widths + encoding) vs
1858///   plain `(name, bytes)` tuples.
1859/// - Purpose: metric capture for layout + font embedding inside flatten.
1860///   Not intended for external callers; use `pdf_xfa::extract_embedded_fonts`
1861///   for inspection-only use cases.
1862///
1863/// Canonical public API: [`crate::extract::extract_embedded_fonts`].
1864#[doc(hidden)]
1865pub fn extract_embedded_fonts(doc: &Document) -> Vec<EmbeddedFontData> {
1866    let mut fonts = Vec::new();
1867    let mut seen = std::collections::HashSet::new();
1868    for (&font_object_id, obj) in &doc.objects {
1869        let dict = match obj.as_dict() {
1870            Ok(d) => d,
1871            Err(_) => continue,
1872        };
1873        let is_font =
1874            dict.get(b"Type").ok().and_then(|o| o.as_name().ok()) == Some(b"Font".as_slice());
1875        if !is_font {
1876            continue;
1877        }
1878        let base_font = match dict.get(b"BaseFont").ok().and_then(|o| o.as_name().ok()) {
1879            Some(n) => String::from_utf8_lossy(n).to_string(),
1880            None => continue,
1881        };
1882
1883        let pdf_widths = extract_font_widths(dict);
1884        let pdf_encoding = extract_font_encoding(doc, dict);
1885        let pdf_source_font =
1886            extract_simple_pdf_source_font(doc, font_object_id, dict, pdf_widths.as_ref());
1887
1888        // First try direct FontDescriptor path (simple TrueType/OpenType fonts)
1889        if let Some((stream_id, data)) = extract_font_from_direct_fd(doc, dict, &base_font) {
1890            if seen.insert(stream_id) {
1891                store_font_data(
1892                    &mut fonts,
1893                    &base_font,
1894                    data,
1895                    pdf_widths.clone(),
1896                    pdf_encoding.clone(),
1897                    pdf_source_font,
1898                );
1899            }
1900            continue;
1901        }
1902
1903        // For CIDFont Type0: also check DescendantFonts path
1904        // CIDFont fonts store their font data in /DescendantFonts[n]/FontDescriptor/FontFile*
1905        // CID fonts use /W arrays (PDF spec §9.7.4.3) instead of simple /Widths.
1906        if let Some((stream_id, data)) = extract_cidfont_data(doc, dict, &base_font, &seen) {
1907            if seen.insert(stream_id) {
1908                let cid_widths = extract_cid_font_widths(doc, dict);
1909                store_font_data(&mut fonts, &base_font, data, cid_widths, None, None);
1910            }
1911            continue;
1912        }
1913
1914        if let Some(source_font) = pdf_source_font {
1915            // fix(#811): if the source PDF already exposes a reusable simple
1916            // font object with /Widths, keep that object alive through the XFA
1917            // pipeline. PDF 1.7 §5.5 defines those widths as the authoritative
1918            // simple-font metrics, and XFA 3.3 §11.7.1 relies on those metrics
1919            // for field fitting.
1920            store_font_data(
1921                &mut fonts,
1922                &base_font,
1923                Vec::new(),
1924                pdf_widths.clone(),
1925                pdf_encoding.clone(),
1926                Some(source_font),
1927            );
1928        }
1929    }
1930    fonts
1931}
1932
1933/// Extract /FirstChar, /LastChar, and /Widths from a font dictionary.
1934fn extract_font_widths(dict: &lopdf::Dictionary) -> Option<(u16, Vec<u16>)> {
1935    let first_char = dict.get(b"FirstChar").ok()?.as_i64().ok()? as u16;
1936    let _last_char = dict.get(b"LastChar").ok()?.as_i64().ok()? as u16;
1937    let widths_array = dict.get(b"Widths").ok()?.as_array().ok()?;
1938    let widths: Vec<u16> = widths_array
1939        .iter()
1940        .filter_map(|w| w.as_i64().ok().map(|v| v as u16))
1941        .collect();
1942    if widths.is_empty() {
1943        return None;
1944    }
1945    Some((first_char, widths))
1946}
1947
1948/// Extract CID font widths from a Type0 (composite) font's `/W` array.
1949///
1950/// CID fonts (PDF spec §9.7.4.3, Table 114) use a different width format than
1951/// simple fonts. Instead of `/FirstChar` + `/Widths`, they use a `/W` array in
1952/// the CIDFont descendant dictionary with two element types:
1953///
1954///   `cid_start [w1 w2 w3 ...]`   — consecutive CIDs starting at cid_start
1955///   `cid_first cid_last width`   — range of CIDs all sharing the same width
1956///
1957/// `/DW` (default width, defaults to 1000) applies to CIDs not listed in `/W`.
1958///
1959/// The result is converted to the same `(first_char, widths)` representation
1960/// used by simple fonts, where `widths[cid - first_char]` gives the width.
1961///
1962/// LIMITATION: CID-to-Unicode mapping via ToUnicode CMap is not parsed here;
1963/// the widths are indexed by raw CID values.
1964fn extract_cid_font_widths(
1965    doc: &Document,
1966    type0_dict: &lopdf::Dictionary,
1967) -> Option<(u16, Vec<u16>)> {
1968    let descendants = type0_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
1969    let desc_ref = descendants.first()?;
1970    let cid_dict = match desc_ref {
1971        Object::Reference(id) => doc.get_dictionary(*id).ok()?,
1972        Object::Dictionary(d) => d,
1973        _ => return None,
1974    };
1975
1976    let default_width = cid_dict
1977        .get(b"DW")
1978        .ok()
1979        .and_then(|o| o.as_i64().ok())
1980        .unwrap_or(1000) as u16;
1981
1982    let w_array = cid_dict.get(b"W").ok()?;
1983    let w_array = match resolve_object(doc, w_array) {
1984        Some(obj) => obj.as_array().ok()?,
1985        None => return None,
1986    };
1987
1988    if w_array.is_empty() {
1989        return None;
1990    }
1991
1992    // First pass: collect all (cid, width) pairs to find bounds.
1993    let mut entries: Vec<(u16, u16)> = Vec::new();
1994    let mut i = 0;
1995    while i < w_array.len() {
1996        let cid_start = match w_array[i].as_i64() {
1997            Ok(v) => v as u16,
1998            Err(_) => {
1999                i += 1;
2000                continue;
2001            }
2002        };
2003        i += 1;
2004        if i >= w_array.len() {
2005            break;
2006        }
2007
2008        // Next element: array → consecutive widths, integer → range end
2009        if let Ok(widths_arr) = w_array[i].as_array() {
2010            // Format: cid_start [w1 w2 w3 ...]
2011            for (j, w_obj) in widths_arr.iter().enumerate() {
2012                if let Ok(w) = w_obj.as_i64() {
2013                    entries.push((cid_start + j as u16, w as u16));
2014                }
2015            }
2016            i += 1;
2017        } else if let Ok(cid_last) = w_array[i].as_i64() {
2018            // Format: cid_first cid_last width
2019            i += 1;
2020            if i >= w_array.len() {
2021                break;
2022            }
2023            if let Ok(width) = w_array[i].as_i64() {
2024                let cid_last = cid_last as u16;
2025                for cid in cid_start..=cid_last {
2026                    entries.push((cid, width as u16));
2027                }
2028            }
2029            i += 1;
2030        } else {
2031            i += 1;
2032        }
2033    }
2034
2035    if entries.is_empty() {
2036        return None;
2037    }
2038
2039    // SAFETY: entries is non-empty (guarded above), so min/max always yield Some.
2040    let min_cid = entries
2041        .iter()
2042        .map(|(c, _)| *c)
2043        .min()
2044        .expect("entries is non-empty");
2045    let max_cid = entries
2046        .iter()
2047        .map(|(c, _)| *c)
2048        .max()
2049        .expect("entries is non-empty");
2050    let len = (max_cid - min_cid + 1) as usize;
2051    let mut widths = vec![default_width; len];
2052    for (cid, w) in &entries {
2053        widths[(*cid - min_cid) as usize] = *w;
2054    }
2055
2056    Some((min_cid, widths))
2057}
2058
2059/// Parse a simple-font `/Encoding` dictionary with `/Differences`.
2060///
2061/// WHY: Custom encodings via `/Differences` are essential for correct glyph
2062/// width mapping. Without this, widths are indexed against the wrong
2063/// characters and text wrapping breaks for fonts that deviate from WinAnsi.
2064///
2065/// SPEC: PDF spec §9.6.6.1 defines `/Differences` as an alternating array of
2066/// starting code integers and glyph names applied on top of a base encoding.
2067///
2068/// LIMITATION: CID fonts (`/Type0`) use CMaps and `/W` arrays instead of this
2069/// simple-font encoding mechanism, so they intentionally return `None` here.
2070fn extract_font_encoding(doc: &Document, dict: &lopdf::Dictionary) -> Option<PdfSimpleEncoding> {
2071    let encoding_obj = resolve_object(doc, dict.get(b"Encoding").ok()?)?;
2072    let encoding_dict = encoding_obj.as_dict().ok()?;
2073    let differences_array = resolve_object(doc, encoding_dict.get(b"Differences").ok()?)?
2074        .as_array()
2075        .ok()?;
2076
2077    let base_encoding = encoding_dict
2078        .get(b"BaseEncoding")
2079        .ok()
2080        .and_then(|obj| resolve_object(doc, obj))
2081        .and_then(|obj| obj.as_name().ok())
2082        .and_then(PdfBaseEncoding::from_pdf_name)
2083        .unwrap_or(PdfBaseEncoding::WinAnsi);
2084
2085    let mut differences = Vec::new();
2086    let mut current_code: Option<u8> = None;
2087    for item in differences_array {
2088        let item = resolve_object(doc, item)?;
2089        if let Ok(code) = item.as_i64() {
2090            current_code = u8::try_from(code).ok();
2091            continue;
2092        }
2093
2094        let Some(name) = item.as_name().ok() else {
2095            continue;
2096        };
2097        let Some(code) = current_code else {
2098            continue;
2099        };
2100        let Some(glyph_name) = std::str::from_utf8(name).ok() else {
2101            continue;
2102        };
2103        if let Some(unicode) = pdf_glyph_name_to_unicode(glyph_name) {
2104            differences.push((code, unicode));
2105        }
2106        current_code = code.checked_add(1);
2107    }
2108
2109    if differences.is_empty() {
2110        return None;
2111    }
2112
2113    Some(PdfSimpleEncoding {
2114        base_encoding,
2115        differences,
2116    })
2117}
2118
2119fn extract_simple_pdf_source_font(
2120    doc: &Document,
2121    font_object_id: ObjectId,
2122    dict: &lopdf::Dictionary,
2123    pdf_widths: Option<&(u16, Vec<u16>)>,
2124) -> Option<PdfSourceFont> {
2125    pdf_widths?;
2126
2127    let subtype = dict.get(b"Subtype").ok().and_then(|obj| obj.as_name().ok());
2128    if subtype == Some(b"Type0".as_slice()) {
2129        return None;
2130    }
2131
2132    // fix(#811): only reuse simple fonts whose emitted PDF text can stay on
2133    // the current WinAnsi path in render_bridge. Fonts with custom encodings
2134    // need a dedicated byte encoder first; otherwise we would preserve widths
2135    // but emit the wrong character codes.
2136    //
2137    // PDF 1.7 §5.5 defines simple-font widths in the font's encoding space.
2138    // LIMITATION: CID/Type0 fonts use /W arrays and CMaps instead; they are
2139    // intentionally excluded here.
2140    let encoding_obj = dict
2141        .get(b"Encoding")
2142        .ok()
2143        .and_then(|obj| resolve_object(doc, obj));
2144    match encoding_obj {
2145        Some(obj) if obj.as_name().ok() == Some(b"WinAnsiEncoding".as_slice()) => {}
2146        Some(obj) => {
2147            let base = obj
2148                .as_dict()
2149                .ok()
2150                .and_then(|enc| enc.get(b"BaseEncoding").ok())
2151                .and_then(|base| resolve_object(doc, base))
2152                .and_then(|base| base.as_name().ok());
2153            if base != Some(b"WinAnsiEncoding".as_slice()) {
2154                return None;
2155            }
2156            if obj
2157                .as_dict()
2158                .ok()
2159                .and_then(|enc| enc.get(b"Differences").ok())
2160                .is_some()
2161            {
2162                return None;
2163            }
2164        }
2165        None => return None,
2166    }
2167
2168    Some(PdfSourceFont {
2169        object_id: font_object_id,
2170    })
2171}
2172
2173fn resolve_object<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
2174    match obj {
2175        Object::Reference(id) => doc.get_object(*id).ok(),
2176        other => Some(other),
2177    }
2178}
2179
2180/// Extract font data from a direct FontDescriptor (FontFile2/3/1 in FontDescriptor).
2181fn extract_font_from_direct_fd(
2182    doc: &Document,
2183    font_dict: &lopdf::Dictionary,
2184    _base_font: &str,
2185) -> Option<(lopdf::ObjectId, Vec<u8>)> {
2186    let fd_id = font_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
2187    let fd = doc.get_dictionary(fd_id).ok()?;
2188
2189    let font_stream_id = fd
2190        .get(b"FontFile2")
2191        .or_else(|_| fd.get(b"FontFile3"))
2192        .or_else(|_| fd.get(b"FontFile"))
2193        .ok()?
2194        .as_reference()
2195        .ok()?;
2196
2197    let stream = doc
2198        .get_object(font_stream_id)
2199        .and_then(|o| o.as_stream())
2200        .ok()?;
2201
2202    let data = stream
2203        .get_plain_content()
2204        .unwrap_or_else(|_| stream.content.clone());
2205
2206    if data.is_empty() {
2207        return None;
2208    }
2209
2210    Some((font_stream_id, data))
2211}
2212
2213/// Extract font data from CIDFont Type0's DescendantFonts path.
2214///
2215/// CIDFont Type0 fonts have their font data in:
2216///   /DescendantFonts[n] /CIDFont /FontDescriptor /FontFile*
2217fn extract_cidfont_data(
2218    doc: &Document,
2219    font_dict: &lopdf::Dictionary,
2220    _base_font: &str,
2221    seen: &std::collections::HashSet<lopdf::ObjectId>,
2222) -> Option<(lopdf::ObjectId, Vec<u8>)> {
2223    // Check if this is a Type0 (composite) font by looking for DescendantFonts
2224    let descendants = font_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
2225
2226    // Iterate through descendant CIDFonts
2227    for desc_ref in descendants {
2228        let desc_id = desc_ref.as_reference().ok()?;
2229        let desc_dict = doc.get_dictionary(desc_id).ok()?;
2230
2231        // Check if this descendant is a CIDFont (has FontDescriptor)
2232        let fd_id = desc_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
2233        let fd = doc.get_dictionary(fd_id).ok()?;
2234
2235        // Try FontFile3 first (CFF font for CIDFontType0C), then FontFile2 (TrueType)
2236        let font_stream_id = fd
2237            .get(b"FontFile3")
2238            .or_else(|_| fd.get(b"FontFile2"))
2239            .or_else(|_| fd.get(b"FontFile"))
2240            .ok()?
2241            .as_reference()
2242            .ok()?;
2243
2244        if seen.contains(&font_stream_id) {
2245            continue;
2246        }
2247
2248        let stream = doc
2249            .get_object(font_stream_id)
2250            .and_then(|o| o.as_stream())
2251            .ok()?;
2252
2253        let data = stream
2254            .get_plain_content()
2255            .unwrap_or_else(|_| stream.content.clone());
2256
2257        if !data.is_empty() {
2258            return Some((font_stream_id, data));
2259        }
2260    }
2261    None
2262}
2263
2264/// Store font data under multiple names (PostScript name, family name, normalized name).
2265fn store_font_data(
2266    fonts: &mut Vec<EmbeddedFontData>,
2267    base_font: &str,
2268    data: Vec<u8>,
2269    pdf_widths: Option<(u16, Vec<u16>)>,
2270    pdf_encoding: Option<PdfSimpleEncoding>,
2271    pdf_source_font: Option<PdfSourceFont>,
2272) {
2273    let clean_name = if let Some(pos) = base_font.find('+') {
2274        base_font[pos + 1..].to_string()
2275    } else {
2276        base_font.to_string()
2277    };
2278    let allow_family_alias = family_alias_is_regular_face(&clean_name, &data);
2279
2280    // Store under the PostScript name (subset prefix already stripped)
2281    fonts.push(EmbeddedFontData {
2282        name: clean_name.clone(),
2283        data: data.clone(),
2284        pdf_widths: pdf_widths.clone(),
2285        pdf_encoding: pdf_encoding.clone(),
2286        pdf_source_font,
2287    });
2288
2289    // Store additional aliases from the font name table. The bare family name
2290    // (e.g. "Arial") is only attached to the regular face so a normal-weight
2291    // XFA request does not get hijacked by a bold/italic variant.
2292    if let Ok(face) = ttf_parser::Face::parse(&data, 0) {
2293        for name_record in face.names() {
2294            let allow_alias = match name_record.name_id {
2295                ttf_parser::name_id::FAMILY => allow_family_alias,
2296                ttf_parser::name_id::FULL_NAME | ttf_parser::name_id::POST_SCRIPT_NAME => true,
2297                _ => false,
2298            };
2299            if !allow_alias {
2300                continue;
2301            }
2302            if let Some(alias) = name_record.to_string() {
2303                if alias != clean_name {
2304                    fonts.push(EmbeddedFontData {
2305                        name: alias,
2306                        data: data.clone(),
2307                        pdf_widths: pdf_widths.clone(),
2308                        pdf_encoding: pdf_encoding.clone(),
2309                        pdf_source_font,
2310                    });
2311                }
2312            }
2313        }
2314    }
2315
2316    // Common PostScript-to-family normalization as fallback. As with the name
2317    // table family alias above, reserve the bare family name for the regular
2318    // face so `Arial` resolves to `ArialMT` rather than `Arial-BoldMT`.
2319    let normalized = ps_name_to_family(&clean_name);
2320    if allow_family_alias && normalized != clean_name {
2321        fonts.push(EmbeddedFontData {
2322            name: normalized,
2323            data,
2324            pdf_widths,
2325            pdf_encoding,
2326            pdf_source_font,
2327        });
2328    }
2329}
2330
2331fn family_alias_is_regular_face(clean_name: &str, data: &[u8]) -> bool {
2332    if let Ok(face) = ttf_parser::Face::parse(data, 0) {
2333        if face.is_bold() || face.is_italic() {
2334            return false;
2335        }
2336    }
2337
2338    let lower = clean_name.to_ascii_lowercase();
2339    !lower.contains("bold") && !lower.contains("italic") && !lower.contains("oblique")
2340}
2341
/// Convert a PostScript font name to its likely family name.
///
/// Examples: `ArialMT` → `Arial`, `TimesNewRomanPSMT` → `Times New Roman`,
/// `MyriadPro-Regular` → `Myriad Pro`.
///
/// Two steps:
/// 1. Strip the first matching weight/style suffix from a fixed list (longer,
///    more specific suffixes are tried before their shorter tails).
/// 2. Insert a space before every uppercase letter that directly follows a
///    lowercase letter.
///
/// The previous character is tracked as a `char`, so names containing
/// multi-byte UTF-8 characters are split correctly.
fn ps_name_to_family(ps_name: &str) -> String {
    // Order matters: e.g. "PSMT" must be tried before the bare "MT".
    const STYLE_SUFFIXES: [&str; 12] = [
        "PSMT",
        "PS-BoldItalicMT",
        "PS-BoldMT",
        "PS-ItalicMT",
        "-BoldItalicMT",
        "-BoldMT",
        "-ItalicMT",
        "MT",
        "-Regular",
        "-Bold",
        "-Italic",
        "-BoldItalic",
    ];

    let base = STYLE_SUFFIXES
        .iter()
        .find_map(|suffix| ps_name.strip_suffix(*suffix))
        .unwrap_or(ps_name);

    // e.g. "TimesNewRoman" → "Times New Roman", "MyriadPro" → "Myriad Pro"
    let mut result = String::with_capacity(base.len() + 4);
    let mut prev: Option<char> = None;
    for ch in base.chars() {
        if ch.is_uppercase() && prev.is_some_and(char::is_lowercase) {
            result.push(' ');
        }
        result.push(ch);
        prev = Some(ch);
    }
    result
}
2376
2377/// Collected font specification from the XFA template.
///
/// One entry is produced per distinct `<font>` variant found by
/// `collect_template_font_entries`; each field mirrors an attribute of that
/// element and the optional ones are `None` when the attribute is absent.
2378struct TemplateFontEntry {
    // Value of the `typeface` attribute (never empty for collected entries).
2379    typeface: String,
    // Raw value of the `weight` attribute, if present.
2380    weight: Option<String>,
    // Raw value of the `posture` attribute, if present.
2381    posture: Option<String>,
    // Raw value of the `genericFamily` attribute, if present.
2382    generic_family: Option<String>,
2383}
2384
2385fn collect_template_font_entries(template_xml: &str) -> Vec<TemplateFontEntry> {
2386    let mut entries = Vec::new();
2387    let mut seen = std::collections::HashSet::new();
2388    if let Ok(xml_doc) = roxmltree::Document::parse(template_xml) {
2389        for node in xml_doc.descendants() {
2390            if node.tag_name().name() == "font" {
2391                if let Some(typeface) = node.attribute("typeface") {
2392                    let name = typeface.to_string();
2393                    let weight = node.attribute("weight").map(|s| s.to_string());
2394                    let posture = node.attribute("posture").map(|s| s.to_string());
2395                    let generic_family = node.attribute("genericFamily").map(|s| s.to_string());
2396                    let key = font_variant_key(&name, weight.as_deref(), posture.as_deref());
2397                    if !name.is_empty() && seen.insert(key.to_lowercase()) {
2398                        entries.push(TemplateFontEntry {
2399                            typeface: name,
2400                            weight,
2401                            posture,
2402                            generic_family,
2403                        });
2404                    }
2405                }
2406            }
2407        }
2408    }
2409    entries
2410}
2411
2412fn embed_font_in_pdf(doc: &mut Document, font: &ResolvedFont) -> ObjectId {
2413    let font_stream = Stream::new(
2414        dictionary! {
2415            "Length" => Object::Integer(font.data.len() as i64),
2416            "Length1" => Object::Integer(font.data.len() as i64)
2417        },
2418        font.data.clone(),
2419    );
2420    let font_file_id = doc.add_object(Object::Stream(font_stream));
2421
2422    let upem = font.units_per_em as f64;
2423    let scale = 1000.0 / upem.max(1.0);
2424    let ascent = (font.ascender as f64 * scale) as i64;
2425    let descent = (font.descender as f64 * scale) as i64;
2426    let cap_height = (ascent as f64 * 0.7) as i64;
2427    let base_name = font.name.replace(' ', "-");
2428
2429    let fd = dictionary! {
2430        "Type" => Object::Name(b"FontDescriptor".to_vec()),
2431        "FontName" => Object::Name(base_name.as_bytes().to_vec()),
2432        "Flags" => Object::Integer(32),
2433        "FontBBox" => Object::Array(vec![
2434            Object::Integer(0),
2435            Object::Integer(descent),
2436            Object::Integer(1000),
2437            Object::Integer(ascent),
2438        ]),
2439        "ItalicAngle" => Object::Integer(0),
2440        "Ascent" => Object::Integer(ascent),
2441        "Descent" => Object::Integer(descent),
2442        "CapHeight" => Object::Integer(cap_height),
2443        "StemV" => Object::Integer(80),
2444        "FontFile2" => Object::Reference(font_file_id)
2445    };
2446    let fd_id = doc.add_object(Object::Dictionary(fd));
2447
2448    // Build CID font data for Identity-H encoding.
2449    let cid_info = font.cid_font_info().unwrap_or(CidFontInfo {
2450        widths: vec![500],
2451        gid_to_unicode: vec![],
2452    });
2453
2454    // /W array: [ 0 [w0 w1 w2 ... wN] ]
2455    let widths_inner: Vec<Object> = cid_info
2456        .widths
2457        .iter()
2458        .map(|&w| Object::Integer(w as i64))
2459        .collect();
2460    let w_array = vec![Object::Integer(0), Object::Array(widths_inner)];
2461
2462    let cid_font = dictionary! {
2463        "Type" => Object::Name(b"Font".to_vec()),
2464        "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
2465        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
2466        "CIDSystemInfo" => Object::Dictionary(dictionary! {
2467            "Registry" => Object::String(b"Adobe".to_vec(), StringFormat::Literal),
2468            "Ordering" => Object::String(b"Identity".to_vec(), StringFormat::Literal),
2469            "Supplement" => Object::Integer(0)
2470        }),
2471        "FontDescriptor" => Object::Reference(fd_id),
2472        "W" => Object::Array(w_array),
2473        "CIDToGIDMap" => Object::Name(b"Identity".to_vec())
2474    };
2475    let cid_font_id = doc.add_object(Object::Dictionary(cid_font));
2476
2477    // ToUnicode CMap for text extraction / copy-paste.
2478    let tounicode_data = generate_tounicode_cmap(&cid_info.gid_to_unicode);
2479    let tounicode_stream = Stream::new(
2480        dictionary! { "Length" => Object::Integer(tounicode_data.len() as i64) },
2481        tounicode_data,
2482    );
2483    let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
2484
2485    // Type0 (composite) font with Identity-H encoding.
2486    let type0_font = dictionary! {
2487        "Type" => Object::Name(b"Font".to_vec()),
2488        "Subtype" => Object::Name(b"Type0".to_vec()),
2489        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
2490        "Encoding" => Object::Name(b"Identity-H".to_vec()),
2491        "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
2492        "ToUnicode" => Object::Reference(tounicode_id)
2493    };
2494    doc.add_object(Object::Dictionary(type0_font))
2495}
2496
/// Generate a ToUnicode CMap stream mapping glyph IDs to Unicode codepoints.
///
/// Each `(gid, ch)` pair becomes one `bfchar` line `<GGGG> <UUUU…>`: the
/// source code is the glyph id as four hex digits and the destination is `ch`
/// encoded as UTF-16BE. Characters outside the Basic Multilingual Plane are
/// therefore written as a surrogate pair (eight hex digits) rather than as a
/// malformed five- or six-digit hex string.
///
/// Mappings are emitted in `bfchar` sections of at most 100 entries. An empty
/// input produces a CMap with no `bfchar` section.
fn generate_tounicode_cmap(gid_to_unicode: &[(u16, char)]) -> Vec<u8> {
    let mut cmap = String::with_capacity(gid_to_unicode.len() * 24 + 256);
    cmap.push_str("/CIDInit /ProcSet findresource begin\n");
    cmap.push_str("12 dict begin\n");
    cmap.push_str("begincmap\n");
    cmap.push_str("/CIDSystemInfo\n");
    cmap.push_str("<< /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n");
    cmap.push_str("/CMapName /Adobe-Identity-UCS def\n");
    cmap.push_str("/CMapType 2 def\n");
    cmap.push_str("1 begincodespacerange\n");
    cmap.push_str("<0000> <FFFF>\n");
    cmap.push_str("endcodespacerange\n");
    for chunk in gid_to_unicode.chunks(100) {
        // Writing into a `String` cannot fail, hence the ignored results.
        let _ = writeln!(cmap, "{} beginbfchar", chunk.len());
        for &(gid, ch) in chunk {
            let _ = write!(cmap, "<{:04X}> <", gid);
            // One UTF-16 unit for BMP characters, two (a surrogate pair) above.
            let mut units = [0u16; 2];
            for unit in ch.encode_utf16(&mut units).iter() {
                let _ = write!(cmap, "{:04X}", unit);
            }
            cmap.push_str(">\n");
        }
        cmap.push_str("endbfchar\n");
    }
    cmap.push_str("endcmap\n");
    cmap.push_str("CMapName currentdict /CMap defineresource pop\n");
    cmap.push_str("end\nend\n");
    cmap.into_bytes()
}
2522
2523/// Resolve all fonts referenced in the XFA template without embedding them.
2524///
2525/// Returns a map from variant key to `ResolvedFont`. The key encodes typeface,
2526/// weight, and posture so that "Arial bold" and "Arial regular" are resolved
2527/// separately. Called BEFORE layout so that resolved metrics can be injected
2528/// into the `FormTree`.
2529fn resolve_template_fonts(template_xml: &str, pdf_bytes: &[u8]) -> HashMap<String, ResolvedFont> {
2530    let mut resolved = HashMap::new();
2531    let entries = collect_template_font_entries(template_xml);
2532    if entries.is_empty() {
2533        return resolved;
2534    }
2535    let source_doc = match Document::load_mem(pdf_bytes) {
2536        Ok(d) => d,
2537        Err(_) => return resolved,
2538    };
2539    let embedded_fonts = extract_embedded_fonts(&source_doc);
2540    let mut resolver = XfaFontResolver::new(embedded_fonts);
2541    for entry in &entries {
2542        let spec = XfaFontSpec::from_xfa_attrs(
2543            &entry.typeface,
2544            entry.weight.as_deref(),
2545            entry.posture.as_deref(),
2546            None,
2547            entry.generic_family.as_deref(),
2548        );
2549        let key = font_variant_key(
2550            &entry.typeface,
2551            entry.weight.as_deref(),
2552            entry.posture.as_deref(),
2553        );
2554        match resolver.resolve(&spec) {
2555            Ok(font) => {
2556                resolved.insert(key, font);
2557            }
2558            Err(e) => {
2559                eprintln!("Font resolution failed for '{}': {}", entry.typeface, e);
2560            }
2561        }
2562    }
2563    resolved
2564}
2565
2566/// Inject resolved font metrics into the FormTree before layout.
2567///
2568/// For each node whose style metadata carries a `font_family`, looks up the
2569/// matching `ResolvedFont` (using the variant key that includes weight/posture)
2570/// and populates the `resolved_widths`, `resolved_upem`, `resolved_ascender`,
2571/// and `resolved_descender` fields on the node's `FontMetrics`.
2572/// This makes `measure_width()` and `line_height_pt()` in the layout engine use
2573/// actual font data instead of generic AFM tables.
2574fn inject_resolved_metrics(
2575    tree: &mut xfa_layout_engine::form::FormTree,
2576    resolved: &HashMap<String, ResolvedFont>,
2577) {
2578    for i in 0..tree.nodes.len() {
2579        let id = xfa_layout_engine::form::FormNodeId(i);
2580        let style = &tree.meta(id).style;
2581        let font_family = style.font_family.clone();
2582        let font_weight = style.font_weight.clone();
2583        let font_style = style.font_style.clone();
2584        if let Some(ref family) = font_family {
2585            // Try variant-specific key first, then fall back to base key.
2586            let variant_key =
2587                font_variant_key(family, font_weight.as_deref(), font_style.as_deref());
2588            let base_key = font_variant_key(family, None, None);
2589            let font = resolved
2590                .get(&variant_key)
2591                .or_else(|| resolved.get(&base_key));
2592            if let Some(font) = font {
2593                let (_first_char, widths) = font.pdf_glyph_widths();
2594                let node = tree.get_mut(id);
2595                node.font.resolved_widths = Some(widths);
2596                node.font.resolved_upem = Some(font.units_per_em);
2597                node.font.resolved_ascender = Some(font.ascender);
2598                node.font.resolved_descender = Some(font.descender);
2599            }
2600        }
2601    }
2602}
2603
2604/// Embed already-resolved fonts into the PDF document.
2605///
2606/// Called AFTER layout. Returns the font_map (typeface -> PDF resource name),
2607/// the font objects for page resources, and the metrics data for render_bridge.
2608fn simple_encoding_unicode_to_code_map(encoding: &PdfSimpleEncoding) -> HashMap<u16, u8> {
2609    let mut map = HashMap::new();
2610    for (code, unicode) in encoding.code_to_unicode_table().into_iter().enumerate() {
2611        if let Some(cp) = unicode {
2612            map.entry(cp).or_insert(code as u8);
2613        }
2614    }
2615    map
2616}
2617
2618fn add_text_chars_for_font(
2619    chars_by_font: &mut HashMap<String, HashSet<char>>,
2620    font_family: Option<&str>,
2621    font_weight: Option<&str>,
2622    font_style: Option<&str>,
2623    text: &str,
2624) {
2625    let Some(family) = font_family else {
2626        return;
2627    };
2628    if text.is_empty() {
2629        return;
2630    }
2631    let chars: Vec<char> = text.chars().filter(|c| !c.is_control()).collect();
2632    if chars.is_empty() {
2633        return;
2634    }
2635
2636    let variant = font_variant_key(family, font_weight, font_style);
2637    chars_by_font
2638        .entry(variant)
2639        .or_default()
2640        .extend(chars.iter().copied());
2641    chars_by_font
2642        .entry(family.to_string())
2643        .or_default()
2644        .extend(chars);
2645}
2646
2647fn add_text_chars_for_style(
2648    chars_by_font: &mut HashMap<String, HashSet<char>>,
2649    style: &FormNodeStyle,
2650    text: &str,
2651) {
2652    add_text_chars_for_font(
2653        chars_by_font,
2654        style.font_family.as_deref(),
2655        style.font_weight.as_deref(),
2656        style.font_style.as_deref(),
2657        text,
2658    );
2659}
2660
2661fn collect_used_chars_from_layout_node(
2662    node: &LayoutNode,
2663    chars_by_font: &mut HashMap<String, HashSet<char>>,
2664) {
2665    match &node.content {
2666        LayoutContent::Text(t) => add_text_chars_for_style(chars_by_font, &node.style, t),
2667        LayoutContent::Field { value, .. } => {
2668            add_text_chars_for_style(chars_by_font, &node.style, value)
2669        }
2670        LayoutContent::WrappedText { lines, .. } => {
2671            for line in lines {
2672                add_text_chars_for_style(chars_by_font, &node.style, line);
2673            }
2674        }
2675        LayoutContent::Draw(DrawContent::Text(t)) => {
2676            add_text_chars_for_style(chars_by_font, &node.style, t)
2677        }
2678        _ => {}
2679    }
2680
2681    if let Some(caption) = &node.style.caption_text {
2682        add_text_chars_for_style(chars_by_font, &node.style, caption);
2683    }
2684
2685    if let Some(spans) = &node.style.rich_text_spans {
2686        for span in spans {
2687            add_text_chars_for_font(
2688                chars_by_font,
2689                span.font_family
2690                    .as_deref()
2691                    .or(node.style.font_family.as_deref()),
2692                span.font_weight
2693                    .as_deref()
2694                    .or(node.style.font_weight.as_deref()),
2695                span.font_style
2696                    .as_deref()
2697                    .or(node.style.font_style.as_deref()),
2698                &span.text,
2699            );
2700        }
2701    }
2702
2703    for child in &node.children {
2704        collect_used_chars_from_layout_node(child, chars_by_font);
2705    }
2706}
2707
2708fn collect_used_chars_by_font(layout: &LayoutDom) -> HashMap<String, HashSet<char>> {
2709    let mut chars_by_font = HashMap::new();
2710    for page in &layout.pages {
2711        for node in &page.nodes {
2712            collect_used_chars_from_layout_node(node, &mut chars_by_font);
2713        }
2714    }
2715    chars_by_font
2716}
2717
2718fn simple_font_can_encode_char(font: &ResolvedFont, ch: char) -> bool {
2719    if ch.is_ascii() {
2720        return true;
2721    }
2722    if let Some(encoding) = &font.pdf_encoding {
2723        let Ok(cp) = u16::try_from(ch as u32) else {
2724            return false;
2725        };
2726        return encoding
2727            .code_to_unicode_table()
2728            .into_iter()
2729            .flatten()
2730            .any(|u| u == cp);
2731    }
2732    unicode_to_winansi(ch).is_some()
2733}
2734
/// Strip a trailing weight/posture marker from a font variant key.
///
/// Returns the part of `key` before one of the four recognised suffixes
/// (`_Bold_Italic`, `_Bold_Normal`, `_Normal_Italic`, `_Normal_Normal`), or
/// `None` when `key` ends with none of them. Matching is case-sensitive.
fn variant_key_base_name(key: &str) -> Option<&str> {
    const VARIANT_SUFFIXES: [&str; 4] = [
        "_Bold_Italic",
        "_Bold_Normal",
        "_Normal_Italic",
        "_Normal_Normal",
    ];

    VARIANT_SUFFIXES
        .iter()
        .find_map(|suffix| key.strip_suffix(*suffix))
}
2741
2742#[allow(clippy::type_complexity)]
2743fn embed_resolved_fonts(
2744    doc: &mut Document,
2745    resolved: &HashMap<String, ResolvedFont>,
2746    layout: &LayoutDom,
2747) -> (
2748    HashMap<String, String>,
2749    Vec<(String, ObjectId)>,
2750    HashMap<String, FontMetricsData>,
2751) {
2752    let mut font_map = HashMap::new();
2753    let mut font_objects = Vec::new();
2754    let mut metrics_data = HashMap::new();
2755    let used_chars_by_font = collect_used_chars_by_font(layout);
2756    for (idx, (name, font)) in resolved.iter().enumerate() {
2757        let resource_name = format!("XFA_F{}", idx);
2758        // fix(#811): once a simple source font survives resolution, keep using
2759        // the original PDF object instead of emitting a synthetic Type0/system
2760        // fallback. That keeps field-fit behaviour aligned with the source PDF
2761        // and Acrobat's interpretation of the same /Widths table.
2762        //
2763        // WHY: custom encodings and non-ASCII content can require Unicode
2764        // shaping through Identity-H. If a simple source font cannot encode
2765        // the actual text in layout output, reusing it would produce '?'
2766        // substitutions in content streams.
2767        //
2768        // LIMITATION: CID source fonts (/Type0 with /W arrays) use a different
2769        // mechanism and are not covered by this simple-font encodeability gate.
2770        let used_chars = used_chars_by_font
2771            .get(name)
2772            .or_else(|| used_chars_by_font.get(&font.name))
2773            .or_else(|| variant_key_base_name(name).and_then(|base| used_chars_by_font.get(base)));
2774        let source_can_encode_all_text = used_chars.is_none_or(|chars| {
2775            chars
2776                .iter()
2777                .all(|ch| simple_font_can_encode_char(font, *ch))
2778        });
2779        let (obj_id, render_font_data) = if let Some(source_font) = font.pdf_source_font {
2780            if source_can_encode_all_text || font.data.is_empty() {
2781                (source_font.object_id, None)
2782            } else {
2783                (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2784            }
2785        } else {
2786            (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2787        };
2788        font_map.insert(name.clone(), format!("/{}", resource_name));
2789        font_objects.push((resource_name, obj_id));
2790        let (_first_char, widths) = font.pdf_glyph_widths();
2791        metrics_data.insert(
2792            name.clone(),
2793            FontMetricsData {
2794                widths,
2795                upem: font.units_per_em,
2796                ascender: font.ascender,
2797                descender: font.descender,
2798                font_data: render_font_data,
2799                face_index: font.face_index,
2800                simple_unicode_to_code: font
2801                    .pdf_encoding
2802                    .as_ref()
2803                    .map(simple_encoding_unicode_to_code_map),
2804            },
2805        );
2806    }
2807    (font_map, font_objects, metrics_data)
2808}
2809
2810/// Fallback: preserve existing page content, strip AcroForm/widgets only.
2811/// If lopdf can't parse the PDF (corrupt xref), return the original bytes
2812/// unchanged — the PDF is too corrupt for us to modify but still renderable.
2813///
2814/// This function ALWAYS returns Ok — errors are logged but the original bytes
2815/// are always returned as a last resort.
2816fn static_fallback(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
2817    let mut doc = match Document::load_mem(pdf_bytes) {
2818        Ok(d) => d,
2819        Err(e) => {
2820            eprintln!("static_fallback: lopdf load failed ({e}), returning original bytes");
2821            return Ok(pdf_bytes.to_vec());
2822        }
2823    };
2824    strip_widgets_and_acroform(&mut doc);
2825    javascript_policy::strip_javascript_for_flatten(&mut doc);
2826    let mut out = Vec::new();
2827    if let Err(e) = doc.save_to(&mut out) {
2828        eprintln!("static_fallback: save failed ({e}), returning original bytes");
2829        return Ok(pdf_bytes.to_vec());
2830    }
2831    Ok(out)
2832}
2833
2834// Epic A E-5: thread-local log for accumulating form-DOM match failures from
2835// inside the local `apply_recursive` fn. That fn is a nested `fn` item, not a
// closure, so it cannot capture state from the enclosing function.
//
// The cell starts out as `None` on every thread.
// NOTE(review): presumably the enclosing function sets it to `Some(..)` before
// the walk and takes the entries afterwards — confirm against the code below.
2836std::thread_local! {
2837    static FORM_DOM_MATCH_LOG: std::cell::RefCell<Option<Vec<FormDomMatchEntry>>> =
2838        const { std::cell::RefCell::new(None) };
2839}
2840
2841/// Apply presence overrides and repeating-instance expansion from the XFA form
2842/// DOM packet.
2843///
2844/// When an XFA PDF has been opened and saved by Adobe Reader, the form DOM
2845/// captures the runtime state of all nodes after scripts executed.  We walk
2846/// the form DOM and the FormTree in parallel (matching by subform/field name)
2847/// to:
2848///
2849/// 1. Transfer `presence="hidden"` attributes that our script interpreter
2850///    could not compute (e.g. Avoka framework's `sfcUtils.updateVisibility`).
2851/// 2. Replicate repeating subform instances (XFA §4.4.3): when `bind
2852///    match="none"` prevents data-driven expansion, the form DOM records the
2853///    correct instance count produced by the runtime's `instanceManager`.  We
2854///    deep-clone the template instance and populate field values from the form
2855///    DOM so the layout engine produces the right number of pages.
2856///
2857/// Return value: `(admitted_count, match_failures, match_log)`.
2858pub(crate) fn apply_form_dom_presence(
2859    tree: &mut FormTree,
2860    root_id: FormNodeId,
2861    form_xml: &str,
2862    policy: XfaRenderingPolicy,
2863    admit_databound_override: bool,
2864) -> (usize, usize, Vec<FormDomMatchEntry>) {
2865    use xfa_layout_engine::form::{FormNodeType, Presence};
2866
2867    let Ok(doc) = roxmltree::Document::parse(form_xml) else {
2868        return (0, 0, Vec::new());
2869    };
2870
2871    /// Deep-clone a subtree rooted at `src_id`, returning the new root id.
2872    fn clone_subtree(tree: &mut FormTree, src_id: FormNodeId) -> FormNodeId {
2873        let node = tree.get(src_id).clone();
2874        let meta = tree.meta(src_id).clone();
2875        // Temporarily take children out to avoid borrow issues
2876        let child_ids: Vec<FormNodeId> = node.children.clone();
2877        let mut new_node = node;
2878        new_node.children = Vec::new();
2879        // Clear xfa_id to avoid duplicate id conflicts
2880        let mut new_meta = meta;
2881        new_meta.xfa_id = None;
2882        let new_id = tree.add_node_with_meta(new_node, new_meta);
2883        // Recursively clone children
2884        for &child_id in &child_ids {
2885            let cloned_child = clone_subtree(tree, child_id);
2886            tree.get_mut(new_id).children.push(cloned_child);
2887        }
2888        new_id
2889    }
2890
2891    /// Extract the text content of the first `<value>` child's inner element
2892    /// (e.g. `<value><text>hello</text></value>` → `"hello"`).
2893    fn extract_field_value(xml_field: roxmltree::Node<'_, '_>) -> Option<String> {
2894        let value_el = xml_field
2895            .children()
2896            .find(|c| c.is_element() && c.tag_name().name() == "value")?;
2897        // The inner element may be <text>, <date>, <time>, <float>, etc.
2898        let inner = value_el.children().find(|c| c.is_element())?;
2899        inner.text().map(|t| t.to_string())
2900    }
2901
2902    /// Return true when a FormTree node semantically matches an XML form-DOM
2903    /// child.  Subform/field/draw match by name; pageSet matches by node type
2904    /// (unnamed in the XFA spec); pageArea matches by name within a pageSet.
2905    fn child_matches(tree: &FormTree, fid: FormNodeId, xml_tag: &str, xml_name: &str) -> bool {
2906        use xfa_layout_engine::form::FormNodeType;
2907        let node = tree.get(fid);
2908        match (xml_tag, &node.node_type) {
2909            ("pageSet", FormNodeType::PageSet) => true,
2910            ("pageArea", FormNodeType::PageArea { .. }) => node.name == xml_name,
2911            ("subform", FormNodeType::Subform | FormNodeType::Area | FormNodeType::ExclGroup) => {
2912                node.name == xml_name
2913            }
2914            ("field", FormNodeType::Field { .. }) => node.name == xml_name,
2915            ("draw", FormNodeType::Draw(_) | FormNodeType::Image { .. }) => node.name == xml_name,
2916            _ => false,
2917        }
2918    }
2919
2920    /// Apply presence, values, and child expansion from the form DOM to a
2921    /// FormTree node.
2922    fn apply_recursive(
2923        tree: &mut FormTree,
2924        form_node_id: FormNodeId,
2925        xml_node: roxmltree::Node<'_, '_>,
2926        policy: XfaRenderingPolicy,
2927        admit_databound_override: bool,
2928    ) -> usize {
2929        let mut admitted: usize = 0;
2930        let xml_tag = xml_node.tag_name().name();
2931        if !matches!(
2932            xml_tag,
2933            "subform" | "field" | "form" | "pageSet" | "pageArea"
2934        ) {
2935            return 0;
2936        }
2937
2938        // Apply presence override.
2939        if xml_tag == "subform" || xml_tag == "field" || xml_tag == "pageArea" {
2940            if let Some(pres) = xml_node.attribute("presence") {
2941                if pres == "hidden" {
2942                    // D9 (trace-only, env-gated): provenance of a form-DOM
2943                    // explicit `presence="hidden"` override. Behaviour-neutral.
2944                    if std::env::var("XFA_PRESENCE_PROV").ok().as_deref() == Some("1") {
2945                        eprintln!(
2946                            "XFA_PRESENCE_PROV site=formdom_explicit id={} name={:?} tag={}",
2947                            form_node_id.0,
2948                            tree.get(form_node_id).name,
2949                            xml_tag
2950                        );
2951                    }
2952                    tree.meta_mut(form_node_id).presence = Presence::Hidden;
2953                }
2954            }
2955        }
2956
2957        // Transfer field value from the form DOM when the FormTree node has no
2958        // value yet (empty string) or the form DOM has a computed value.
2959        if xml_tag == "field" {
2960            if let Some(val) = extract_field_value(xml_node) {
2961                if let FormNodeType::Field { ref value, .. } = tree.get(form_node_id).node_type {
2962                    if value.is_empty() {
2963                        tree.get_mut(form_node_id).node_type = FormNodeType::Field { value: val };
2964                    }
2965                }
2966            }
2967            return 0; // fields have no structural children to recurse into
2968        }
2969
2970        // Collect XML children we walk through.
2971        //
2972        // - `subform`, `field`, `draw` are the canonical content children.
2973        // - `pageSet` and `pageArea` carry runtime-allocated page instances
2974        //   that must be mirrored in the FormTree so the layout engine can
2975        //   emit one page per runtime instance (XFA 3.3 §8.6 / §3.1).
2976        let xml_children: Vec<roxmltree::Node<'_, '_>> = xml_node
2977            .children()
2978            .filter(|c| {
2979                c.is_element()
2980                    && matches!(
2981                        c.tag_name().name(),
2982                        "subform" | "field" | "draw" | "pageSet" | "pageArea"
2983                    )
2984            })
2985            .collect();
2986
2987        // Within a `pageSet`, only allow pageArea expansion when the form-DOM
2988        // enumerates a SINGLE pageArea name repeated multiple times — the
2989        // classic "uniform template" pattern recorded by Adobe's runtime when
2990        // it allocates more instances than the template declared (XFA 3.3
2991        // §8.6 / §3.1, e.g. 13275420c3c9afbb: 10× `<pageArea name="Page1">`).
2992        //
2993        // When the pageSet enumerates multiple distinct pageArea names
2994        // (e.g. IRCC forms with `Page1` + `OverFlowPage`), Adobe pre-allocates
2995        // pageAreas as a *menu* of available templates — not every instance is
2996        // actually rendered.  Expanding clones in that case over-paginates.
2997        //
2998        // Implementation: when inside a pageSet AND the pageArea-name set has
2999        // more than one distinct entry, suppress cloning by setting the
3000        // per-group `expansion_allowed` flag to false later.
3001        let inside_page_set = xml_tag == "pageSet";
3002        let uniform_page_area_template = if inside_page_set {
3003            let mut names: Vec<&str> = xml_children
3004                .iter()
3005                .filter(|c| c.tag_name().name() == "pageArea")
3006                .map(|c| c.attribute("name").unwrap_or(""))
3007                .collect();
3008            names.sort_unstable();
3009            names.dedup();
3010            names.len() == 1
3011        } else {
3012            false
3013        };
3014
3015        // Group consecutive XML children by (tag, name) to detect repeating
3016        // instances.  pageSet is unnamed in XFA, so a single `pageSet` group
3017        // is keyed by tag alone; pageArea siblings share `name="Page1"` for
3018        // runtime-allocated pages.
3019        let mut xml_groups: Vec<((&str, &str), Vec<roxmltree::Node<'_, '_>>)> = Vec::new();
3020        for &xc in &xml_children {
3021            let xtag = xc.tag_name().name();
3022            let xname = xc.attribute("name").unwrap_or("");
3023            let key = (xtag, xname);
3024            if let Some(last) = xml_groups.last_mut() {
3025                if last.0 == key {
3026                    last.1.push(xc);
3027                    continue;
3028                }
3029            }
3030            xml_groups.push((key, vec![xc]));
3031        }
3032
3033        // For each group, match against FormTree children, cloning when needed.
3034        let mut form_children = tree.get(form_node_id).children.clone();
3035        let mut used = vec![false; form_children.len()];
3036
3037        for (gkey, group_xml_nodes) in &xml_groups {
3038            let (gtag, gname) = *gkey;
3039            let xml_count = group_xml_nodes.len();
3040
3041            // Count existing FormTree children matching this XML group.
3042            let existing: Vec<(usize, FormNodeId)> = form_children
3043                .iter()
3044                .enumerate()
3045                .filter(|(i, &fid)| !used[*i] && child_matches(tree, fid, gtag, gname))
3046                .map(|(i, &fid)| (i, fid))
3047                .collect();
3048            let existing_count = existing.len();
3049
3050            // Gate per-group cloning:
3051            //
3052            // * pageArea clones may only be created inside a `pageSet` that
3053            //   enumerates a single uniform pageArea template (see comment
3054            //   above).  Otherwise pre-allocated "menu" pageAreas would be
3055            //   replicated and inflate the rendered page count.
3056            // * Other tags (subform/field/draw) follow the existing
3057            //   instance-replication behaviour.
3058            let expansion_allowed = if gtag == "pageArea" {
3059                inside_page_set && uniform_page_area_template
3060            } else {
3061                true
3062            };
3063
3064            // If the form DOM has more instances than the FormTree, clone to match.
3065            if expansion_allowed && xml_count > existing_count && existing_count > 0 {
3066                let template_id = existing[0].1;
3067                // Find insertion position: after the last existing sibling.
3068                // SAFETY: existing_count > 0 is guarded by the enclosing `if`.
3069                let last_existing_idx = existing.last().expect("existing_count > 0").0;
3070                let insert_pos = last_existing_idx + 1;
3071                let clones_needed = xml_count - existing_count;
3072                let mut new_ids = Vec::new();
3073                for _ in 0..clones_needed {
3074                    let cloned = clone_subtree(tree, template_id);
3075                    new_ids.push(cloned);
3076                }
3077                // Insert cloned nodes into the parent's children list
3078                for (offset, new_id) in new_ids.iter().enumerate() {
3079                    form_children.insert(insert_pos + offset, *new_id);
3080                    used.insert(insert_pos + offset, false);
3081                }
3082                // Persist the updated children list
3083                tree.get_mut(form_node_id).children = form_children.clone();
3084            }
3085
3086            // Mark expanded pageAreas as runtime-instantiated so the layout
3087            // engine emits a page per instance and the page-drop filter does
3088            // not discard them.  Only applies when the pageArea expansion is
3089            // active (uniform template inside a pageSet) AND clones were
3090            // actually created — single-template forms (no expansion) keep
3091            // their existing layout semantics.
3092            if gtag == "pageArea" && expansion_allowed && xml_count > existing_count {
3093                let to_mark: Vec<FormNodeId> = form_children
3094                    .iter()
3095                    .copied()
3096                    .filter(|&fid| child_matches(tree, fid, gtag, gname))
3097                    .collect();
3098                for fid in to_mark {
3099                    tree.meta_mut(fid).runtime_instantiated_page = true;
3100                }
3101            }
3102
3103            // Now match each XML node in the group to a FormTree child
3104            for (group_idx, &xc) in group_xml_nodes.iter().enumerate() {
3105                // Find next unmatched FormTree child with the same shape.
3106                let matched = form_children
3107                    .iter()
3108                    .enumerate()
3109                    .skip(if group_idx > 0 {
3110                        // Start searching after the last matched position
3111                        form_children
3112                            .iter()
3113                            .enumerate()
3114                            .rfind(|(i, &fid)| used[*i] && child_matches(tree, fid, gtag, gname))
3115                            .map(|(i, _)| i + 1)
3116                            .unwrap_or(0)
3117                    } else {
3118                        0
3119                    })
3120                    .find(|(i, &fid)| !used[*i] && child_matches(tree, fid, gtag, gname));
3121                if let Some((idx, &fid)) = matched {
3122                    used[idx] = true;
3123                    admitted += apply_recursive(tree, fid, xc, policy, admit_databound_override);
3124                }
3125            }
3126        }
3127
3128        // XFA §3.1: the form DOM represents the runtime-instantiated form.
3129        // Named template subforms NOT present in the form DOM were never
3130        // instantiated by Adobe's runtime (e.g. script-driven conditional
3131        // sections).  Hide them to prevent over-pagination from phantom
3132        // page-level subforms.
3133        //
3134        // Only suppress when the form DOM explicitly lists subform children;
3135        // a sparse form DOM with no structural children means it didn't
3136        // record child state and we should not infer absence.
3137        let has_subform_children = xml_children
3138            .iter()
3139            .any(|c| c.tag_name().name() == "subform");
3140        if has_subform_children {
3141            for (i, &fid) in form_children.iter().enumerate() {
3142                if used[i] {
3143                    continue;
3144                }
3145                let child_node = tree.get(fid);
3146                // Only suppress named subforms — skip pageSet, unnamed
3147                // transparent nodes, draws, fields, and structural elements.
3148                if matches!(child_node.node_type, FormNodeType::Subform)
3149                    && !child_node.name.is_empty()
3150                {
3151                    // Admit data-bound unmatched subforms instead of
3152                    // suppressing them. Two triggers share one guard:
3153                    //  - `FreshMergeExperimental` (experimental policy), or
3154                    //  - `XFA_FORMDOM_ADMIT_DATABOUND=1`, a default-off
3155                    //    production override that lets `SavedStateFaithful`
3156                    //    admit the same set WITHOUT flipping the policy default.
3157                    //
3158                    // Guard (admit only when):
3159                    //  - not template-hidden (`Presence::Hidden` / `Inactive`)
3160                    //  - not a zero-instance prototype placeholder
3161                    //  - has a data-node bound during merge
3162                    //  - did not opt out of binding via `<bind match="none">`
3163                    //
3164                    // The `bound_data_node.is_some()` clause is the
3165                    // over-pagination guard: truly-unmatched NON-data subforms
3166                    // (no bound data node) are never admitted, so the §3.1
3167                    // suppression that protects against phantom page-level
3168                    // subforms still holds. A data-bound subform only adds a
3169                    // page when its admitted content overflows — the layout
3170                    // engine self-regulates; this is not a page-count heuristic.
3171                    let meta = tree.meta(fid);
3172                    let admit_unmatched_databound = (policy
3173                        == XfaRenderingPolicy::FreshMergeExperimental
3174                        || admit_databound_override)
3175                        && !matches!(meta.presence, Presence::Hidden | Presence::Inactive)
3176                        && !meta.is_zero_instance_prototype
3177                        && meta.bound_data_node.is_some()
3178                        && !meta.data_bind_none;
3179
3180                    if std::env::var("XFA_PRESENCE_PROV").ok().as_deref() == Some("1") {
3181                        let site = if admit_unmatched_databound {
3182                            if policy == XfaRenderingPolicy::FreshMergeExperimental {
3183                                "formdom_unmatched_fresh_merge_admitted"
3184                            } else {
3185                                "formdom_unmatched_databound_admitted"
3186                            }
3187                        } else {
3188                            "formdom_unmatched"
3189                        };
3190                        eprintln!(
3191                            "XFA_PRESENCE_PROV site={site} id={} name={:?}",
3192                            fid.0, child_node.name
3193                        );
3194                    }
3195
3196                    if admit_unmatched_databound {
3197                        // Admit — leave presence as-is (Visible by default
3198                        // from the merger).
3199                        admitted += 1;
3200                    } else {
3201                        // SavedStateFaithful without override (or ineligible
3202                        // node): suppress.
3203                        // Epic A E-5: capture name before mutating (borrow order).
3204                        let suppressed_name = child_node.name.clone();
3205                        let suppressed_id = fid.0;
3206                        tree.meta_mut(fid).presence = Presence::Hidden;
3207                        crate::flatten::FORM_DOM_MATCH_LOG.with(|cell| {
3208                            if let Some(ref mut log) = *cell.borrow_mut() {
3209                                if log.len() < 200 {
3210                                    log.push(crate::dynamic::FormDomMatchEntry {
3211                                        template_node_id: suppressed_id,
3212                                        template_node_name: suppressed_name,
3213                                        reason: "formdom_unmatched_suppressed".to_string(),
3214                                    });
3215                                }
3216                            }
3217                        });
3218                    }
3219                }
3220            }
3221        }
3222        admitted
3223    }
3224
3225    // The form DOM root is <form><subform name="...">...</subform></form>
3226    let form_root = doc.root_element();
3227    let form_root_subform = form_root
3228        .children()
3229        .find(|c| c.is_element() && c.tag_name().name() == "subform");
3230
3231    // E-5: arm the thread-local if XFA_RUNTIME_DIAG or XFA_FLATTEN_TRACE.
3232    let diag_on = runtime_diag_enabled() || crate::flatten_trace::enabled();
3233    if diag_on {
3234        FORM_DOM_MATCH_LOG.with(|cell| {
3235            *cell.borrow_mut() = Some(Vec::new());
3236        });
3237    }
3238
3239    let mut total_admitted: usize = 0;
3240    if let Some(xml_root_sf) = form_root_subform {
3241        let root_children = tree.get(root_id).children.clone();
3242        let root_name = xml_root_sf.attribute("name").unwrap_or("");
3243        for &child_id in &root_children {
3244            if tree.get(child_id).name == root_name {
3245                total_admitted += apply_recursive(
3246                    tree,
3247                    child_id,
3248                    xml_root_sf,
3249                    policy,
3250                    admit_databound_override,
3251                );
3252                break;
3253            }
3254        }
3255    }
3256
3257    // E-5: drain.
3258    let match_log = if diag_on {
3259        FORM_DOM_MATCH_LOG.with(|cell| cell.borrow_mut().take().unwrap_or_default())
3260    } else {
3261        Vec::new()
3262    };
3263    let match_failures = match_log.len();
3264    (total_admitted, match_failures, match_log)
3265}
3266
3267/// Tiny PDFs (<1KB) with XFA templates that lack essential elements (subform,
3268/// pageSet) are corrupt stubs. Attempting to flatten these produces blank pages
3269/// instead of preserving the original page content.
3270fn is_corrupt_xfa_template(pdf_size: usize, template_xml: &str) -> bool {
3271    // Only apply to small PDFs — larger files may have legitimate sparse templates.
3272    if pdf_size >= 1024 {
3273        return false;
3274    }
3275    // A valid XFA template must parse and contain at least one subform or pageSet.
3276    match roxmltree::Document::parse(template_xml) {
3277        Ok(doc) => {
3278            let root = doc.root_element();
3279            !root.children().any(|c| {
3280                c.is_element()
3281                    && matches!(c.tag_name().name(), "subform" | "pageSet" | "subformSet")
3282            })
3283        }
3284        Err(_) => true, // Unparseable template is corrupt.
3285    }
3286}
3287
3288/// Strip undefined XML entity references from XFA template/datasets XML.
3289///
3290/// `roxmltree` only supports the five predefined XML entities (lt, gt, amp,
3291/// quot, apos). Some XFA PDFs contain custom entity references like `&xxe;`
3292/// that cause parse failures, so we drop only those references.
3293///
3294/// fixes #812: Adobe-generated XFA packets also contain raw `&` inside
3295/// processing instructions such as `<?renderCache.subset ... "#$%&'()+"?>`
3296/// and `<?renderCache.textRun ... "A. Adjustment & Location" ...?>`.
3297/// Those packets are valid XML because PI payload is opaque text. The old
3298/// implementation deleted everything between `&` and the next `;`, which
3299/// corrupted valid templates before merge and forced the flattener down the
3300/// 1-page static fallback path.
3301///
3302/// XFA Spec 3.3 §8.6 / §8.8 rely on the template reaching the merge/layout
3303/// pipeline intact. CID `/W` handling is unrelated and remains out of scope.
3304fn strip_undefined_xml_entities(xml: &str) -> String {
3305    let predefined = ["lt", "gt", "amp", "quot", "apos"];
3306    let mut result = String::with_capacity(xml.len());
3307    let bytes = xml.as_bytes();
3308    let mut pos = 0;
3309
3310    while let Some(rel_amp_pos) = xml[pos..].find('&') {
3311        let amp_pos = pos + rel_amp_pos;
3312        result.push_str(&xml[pos..amp_pos]);
3313
3314        if let Some((entity_name, next_pos)) = parse_xml_entity_reference(xml, amp_pos) {
3315            // Keep numeric character references (&#123; or &#x1F;) and the
3316            // predefined XML entities. Drop only true named entity references
3317            // that roxmltree cannot resolve.
3318            if entity_name.starts_with('#') || predefined.contains(&entity_name) {
3319                result.push_str(&xml[amp_pos..next_pos]);
3320            }
3321            pos = next_pos;
3322        } else {
3323            // Not an XML entity reference; preserve the raw ampersand.
3324            result.push('&');
3325            pos = amp_pos + 1;
3326        }
3327    }
3328
3329    if pos < bytes.len() {
3330        result.push_str(&xml[pos..]);
3331    }
3332    result
3333}
3334
3335fn parse_xml_entity_reference(xml: &str, amp_pos: usize) -> Option<(&str, usize)> {
3336    let bytes = xml.as_bytes();
3337    let start = amp_pos + 1;
3338    let first = *bytes.get(start)?;
3339
3340    // Numeric character references: &#123; or &#x1F;
3341    if first == b'#' {
3342        let mut idx = start + 1;
3343        if matches!(bytes.get(idx), Some(b'x' | b'X')) {
3344            idx += 1;
3345            let hex_start = idx;
3346            while matches!(
3347                bytes.get(idx),
3348                Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
3349            ) {
3350                idx += 1;
3351            }
3352            if idx == hex_start || !matches!(bytes.get(idx), Some(b';')) {
3353                return None;
3354            }
3355        } else {
3356            let digits_start = idx;
3357            while matches!(bytes.get(idx), Some(b'0'..=b'9')) {
3358                idx += 1;
3359            }
3360            if idx == digits_start || !matches!(bytes.get(idx), Some(b';')) {
3361                return None;
3362            }
3363        }
3364        return Some((&xml[start..idx], idx + 1));
3365    }
3366
3367    // Named references: &name; where `name` follows XML Name syntax enough to
3368    // distinguish it from raw PI/script/text ampersands.
3369    if !is_xml_name_start(first) {
3370        return None;
3371    }
3372
3373    let mut idx = start + 1;
3374    while let Some(&b) = bytes.get(idx) {
3375        if b == b';' {
3376            return Some((&xml[start..idx], idx + 1));
3377        }
3378        if !is_xml_name_char(b) {
3379            return None;
3380        }
3381        idx += 1;
3382    }
3383    None
3384}
3385
/// ASCII subset of the characters allowed to start an XML name: letters,
/// `:` and `_`.
fn is_xml_name_start(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b':' || byte == b'_'
}
3389
3390fn is_xml_name_char(byte: u8) -> bool {
3391    is_xml_name_start(byte) || matches!(byte, b'-' | b'.' | b'0'..=b'9')
3392}
3393
3394// ---------------------------------------------------------------------------
3395// Helpers
3396// ---------------------------------------------------------------------------
3397
3398/// Returns `true` when the PDF's pages already carry substantial static content.
3399///
3400/// An array /Contents entry (multiple streams) or any individual stream larger
3401/// than 200 bytes indicates pre-flattened page content that should be preserved
3402/// rather than replaced by XFA re-rendering. Adobe's default XFA fallback
3403/// page ("Please wait..." / Adobe Reader upgrade text) is explicitly ignored:
3404/// those bytes are not real pre-rendered form content and must not suppress
3405/// XFA flattening.
3406fn pages_have_static_content(doc: &Document) -> bool {
3407    for page_id in doc.page_iter() {
3408        let streams = page_content_streams(doc, page_id);
3409        if streams.is_empty() {
3410            continue;
3411        }
3412
3413        // Count text-drawing operators (Tj/TJ) across all non-placeholder
3414        // content streams for this page. A real pre-rendered form page has
3415        // dozens of text operators; a watermark or evaluation overlay has
3416        // only 1–3. We require ≥5 non-placeholder text operators to
3417        // consider the page as having substantial static content.
3418        let mut text_op_count = 0usize;
3419        for stream in &streams {
3420            if is_xfa_placeholder_stream(stream) || is_watermark_stream(stream) {
3421                continue;
3422            }
3423            text_op_count += count_text_operators(stream);
3424        }
3425
3426        if text_op_count >= 5 {
3427            return true;
3428        }
3429    }
3430    false
3431}
3432
3433fn page_content_streams(doc: &Document, page_id: ObjectId) -> Vec<Vec<u8>> {
3434    let Ok(page_dict) = doc.get_dictionary(page_id) else {
3435        return Vec::new();
3436    };
3437
3438    match page_dict.get(b"Contents") {
3439        Ok(Object::Array(arr)) => arr
3440            .iter()
3441            .filter_map(|object| resolve_stream_content(doc, object))
3442            .collect(),
3443        Ok(Object::Reference(id)) => match doc.get_object(*id) {
3444            Ok(Object::Array(arr)) => arr
3445                .iter()
3446                .filter_map(|object| resolve_stream_content(doc, object))
3447                .collect(),
3448            Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
3449            Err(_) => Vec::new(),
3450        },
3451        Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
3452        Err(_) => Vec::new(),
3453    }
3454}
3455
3456fn resolve_stream_content(doc: &Document, object: &Object) -> Option<Vec<u8>> {
3457    let stream = match object {
3458        Object::Reference(id) => doc.get_object(*id).ok()?.as_stream().ok()?,
3459        Object::Stream(stream) => stream,
3460        _ => return None,
3461    };
3462
3463    stream
3464        .get_plain_content()
3465        .ok()
3466        .or_else(|| Some(stream.content.clone()))
3467}
3468
/// Count text-drawing operators (Tj / TJ) in a content stream.
///
/// An occurrence is counted when `Tj` or `TJ` directly follows a space, `)`
/// or `]`.
fn count_text_operators(stream: &[u8]) -> usize {
    stream
        .windows(3)
        .filter(|triple| matches!(triple, [b' ' | b')' | b']', b'T', b'j' | b'J']))
        .count()
}
3482
3483/// Bake checkbox/radio button appearance marks from AcroForm widget AP streams
3484/// onto existing page content for dynamic XFA forms.
3485///
3486/// Hybrid XFA PDFs carry pre-rendered appearance streams in their widget `/AP/N`
3487/// dictionaries. For radio/checkbox widgets the Normal appearance dict often has
3488/// only the "on" state (filled circle / checkmark) with no "Off" entry. Only
3489/// widgets that are currently asserted should contribute that mark to the
3490/// flattened page; widgets explicitly in the `Off` state must not be stamped
3491/// with the on-mark just because `/AP/N` lacks an `Off` appearance.
3492fn bake_checkbox_radio_ap_marks(doc: &mut Document, page_id: ObjectId) -> usize {
3493    let annots = page_annotations(doc, page_id);
3494    if annots.is_empty() {
3495        return 0;
3496    }
3497
3498    let mut baked = 0usize;
3499    let mut overlay_ops = Vec::new();
3500
3501    for annot in &annots {
3502        let Some(annot_id) = annot.as_reference().ok() else {
3503            continue;
3504        };
3505        let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
3506            continue;
3507        };
3508
3509        let is_widget = annot_dict
3510            .get(b"Subtype")
3511            .ok()
3512            .and_then(|obj| obj.as_name().ok())
3513            == Some(&b"Widget"[..]);
3514        if !is_widget {
3515            continue;
3516        }
3517
3518        // Radio/checkbox widgets have a dictionary of named states in /AP/N
3519        // (e.g. /N << /0 35 0 R >>).  Text fields and pushbuttons have /AP/N
3520        // as a single stream reference.  Use this to filter.
3521        let ap = match annot_dict.get(b"AP").ok().and_then(|o| o.as_dict().ok()) {
3522            Some(ap) => ap.clone(),
3523            None => continue,
3524        };
3525        let normal_obj = match ap.get(b"N").ok() {
3526            Some(obj) => obj.clone(),
3527            None => continue,
3528        };
3529
3530        // Resolve /N to a dictionary of appearance states.
3531        let states: Dictionary = match &normal_obj {
3532            Object::Reference(id) => match doc.get_object(*id).ok().cloned() {
3533                Some(Object::Dictionary(d)) => d,
3534                _ => continue, // direct stream → not radio/checkbox
3535            },
3536            Object::Dictionary(d) => d.clone(),
3537            _ => continue,
3538        };
3539
3540        if matches!(selected_widget_state(&annot_dict), Some(state) if state == b"Off") {
3541            continue;
3542        }
3543
3544        // Find the first non-"Off" state (the "on" mark appearance).
3545        let on_id = states
3546            .iter()
3547            .filter(|(name, _)| name.as_slice() != b"Off")
3548            .find_map(|(_, obj)| match obj {
3549                Object::Reference(id) => Some(*id),
3550                _ => None,
3551            });
3552        let Some(ap_id) = on_id else { continue };
3553
3554        // Verify the referenced object is a Form XObject stream.
3555        match doc.get_object(ap_id).ok() {
3556            Some(Object::Stream(_)) => {}
3557            _ => continue,
3558        }
3559
3560        let Some(rect) = annotation_rect(&annot_dict) else {
3561            continue;
3562        };
3563
3564        let xobject_name = format!("XfaCbAp{}", baked);
3565        add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
3566        write_ops(
3567            &mut overlay_ops,
3568            format_args!(
3569                "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
3570                rect[0], rect[1], xobject_name
3571            ),
3572        );
3573        baked += 1;
3574    }
3575
3576    if !overlay_ops.is_empty() {
3577        append_to_page_content(doc, page_id, &overlay_ops);
3578    }
3579
3580    baked
3581}
3582
3583fn is_xfa_placeholder_stream(stream: &[u8]) -> bool {
3584    const PLACEHOLDER_MARKERS: [&[u8]; 5] = [
3585        b"Please wait",
3586        b"Adobe Reader",
3587        b"reader_download",
3588        b"display this type of document",
3589        b"To view the full contents",
3590    ];
3591
3592    PLACEHOLDER_MARKERS
3593        .iter()
3594        .any(|marker| contains_ascii_case_insensitive(stream, marker))
3595}
3596
3597/// Detect evaluation-software watermark overlays (e.g. "Qoppa Software",
3598/// "For Evaluation Only"). These are short streams with ≤3 Tj operators
3599/// that should not count as real pre-rendered form content.
3600fn is_watermark_stream(stream: &[u8]) -> bool {
3601    const WATERMARK_MARKERS: [&[u8]; 3] =
3602        [b"Evaluation Only", b"Qoppa Software", b"For Evaluation"];
3603    WATERMARK_MARKERS
3604        .iter()
3605        .any(|marker| contains_ascii_case_insensitive(stream, marker))
3606}
3607
/// Report whether `needle` occurs anywhere in `haystack`, comparing bytes
/// without regard to ASCII case.
///
/// An empty `needle` is contained in every haystack (including an empty one).
/// It is handled up front because `slice::windows` panics when asked for
/// zero-length windows.
fn contains_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    haystack
        .windows(needle.len())
        .any(|window| window.eq_ignore_ascii_case(needle))
}
3613
/// Append the formatted `args` to `buf` as UTF-8 bytes.
///
/// Formatting errors are ignored; whatever text was produced before the error
/// still ends up in `buf`.
fn write_ops(buf: &mut Vec<u8>, args: std::fmt::Arguments<'_>) {
    /// Adapter that lets the formatter write straight into the byte buffer.
    struct ByteSink<'a>(&'a mut Vec<u8>);

    impl std::fmt::Write for ByteSink<'_> {
        fn write_str(&mut self, text: &str) -> std::fmt::Result {
            self.0.extend_from_slice(text.as_bytes());
            Ok(())
        }
    }

    let _ = std::fmt::write(&mut ByteSink(buf), args);
}
3621
3622/// Flatten Widget annotation appearances onto their pages.
3623///
3624/// Hybrid XFA PDFs often already contain the correct visual representation in
3625/// widget `/AP` streams. Stripping those widgets outright drops borders, text,
3626/// checkboxes, and image buttons. This helper bakes the normal appearance onto
3627/// the page content and removes only the widgets that were successfully
3628/// flattened. Returns the number of widgets flattened.
3629fn flatten_widget_appearances(doc: &mut Document) -> usize {
3630    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
3631    let mut flattened = 0usize;
3632
3633    for page_id in page_ids {
3634        let annots = page_annotations(doc, page_id);
3635        if annots.is_empty() {
3636            continue;
3637        }
3638
3639        let mut retained = Vec::new();
3640        let mut overlay_ops = Vec::new();
3641
3642        for annot in annots {
3643            let Some(annot_id) = annot.as_reference().ok() else {
3644                retained.push(annot);
3645                continue;
3646            };
3647
3648            let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
3649                retained.push(annot);
3650                continue;
3651            };
3652
3653            let is_widget = annot_dict
3654                .get(b"Subtype")
3655                .ok()
3656                .and_then(|obj| obj.as_name().ok())
3657                == Some(&b"Widget"[..]);
3658            if !is_widget {
3659                retained.push(annot);
3660                continue;
3661            }
3662
3663            let Some(rect) = annotation_rect(&annot_dict) else {
3664                retained.push(Object::Reference(annot_id));
3665                continue;
3666            };
3667            let Some(ap_id) = resolve_widget_normal_appearance(doc, &annot_dict) else {
3668                retained.push(Object::Reference(annot_id));
3669                continue;
3670            };
3671
3672            let xobject_name = format!("XfaAp{}", flattened);
3673            add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
3674            write_ops(
3675                &mut overlay_ops,
3676                format_args!(
3677                    "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
3678                    rect[0], rect[1], xobject_name
3679                ),
3680            );
3681            flattened += 1;
3682        }
3683
3684        if overlay_ops.is_empty() {
3685            continue;
3686        }
3687
3688        append_to_page_content(doc, page_id, &overlay_ops);
3689        set_page_annotations(doc, page_id, retained);
3690    }
3691
3692    flattened
3693}
3694
3695/// Remove Widget annotations from a page, keeping non-Widget annotations.
3696fn strip_widget_annotations(doc: &mut Document, page_id: ObjectId) {
3697    let annots = page_annotations(doc, page_id);
3698    if annots.is_empty() {
3699        return;
3700    }
3701    let mut retained = Vec::new();
3702    for annot in &annots {
3703        let is_widget = annot
3704            .as_reference()
3705            .ok()
3706            .and_then(|id| doc.get_dictionary(id).ok())
3707            .and_then(|d| d.get(b"Subtype").ok())
3708            .and_then(|obj| obj.as_name().ok())
3709            == Some(&b"Widget"[..]);
3710        if !is_widget {
3711            retained.push(annot.clone());
3712        }
3713    }
3714    set_page_annotations(doc, page_id, retained);
3715}
3716
3717fn page_annotations(doc: &Document, page_id: ObjectId) -> Vec<Object> {
3718    let Ok(page_dict) = doc.get_dictionary(page_id) else {
3719        return Vec::new();
3720    };
3721
3722    match page_dict.get(b"Annots") {
3723        Ok(Object::Array(arr)) => arr.clone(),
3724        Ok(Object::Reference(id)) => doc
3725            .get_object(*id)
3726            .ok()
3727            .and_then(|obj| obj.as_array().ok().cloned())
3728            .unwrap_or_default(),
3729        _ => Vec::new(),
3730    }
3731}
3732
3733fn set_page_annotations(doc: &mut Document, page_id: ObjectId, annots: Vec<Object>) {
3734    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3735        if annots.is_empty() {
3736            page_dict.remove(b"Annots");
3737        } else {
3738            page_dict.set("Annots", Object::Array(annots));
3739        }
3740    }
3741}
3742
3743fn annotation_rect(dict: &Dictionary) -> Option<[f32; 4]> {
3744    let rect = dict.get(b"Rect").ok()?.as_array().ok()?;
3745    if rect.len() != 4 {
3746        return None;
3747    }
3748    Some([
3749        rect[0].as_float().ok()?,
3750        rect[1].as_float().ok()?,
3751        rect[2].as_float().ok()?,
3752        rect[3].as_float().ok()?,
3753    ])
3754}
3755
3756fn resolve_widget_normal_appearance(
3757    doc: &mut Document,
3758    annot_dict: &Dictionary,
3759) -> Option<ObjectId> {
3760    let ap = annot_dict.get(b"AP").ok()?.as_dict().ok()?;
3761    let normal = ap.get(b"N").ok()?;
3762    resolve_appearance_object(doc, annot_dict, normal)
3763}
3764
3765fn resolve_appearance_object(
3766    doc: &mut Document,
3767    annot_dict: &Dictionary,
3768    object: &Object,
3769) -> Option<ObjectId> {
3770    match object {
3771        Object::Reference(id) => match doc.get_object(*id).ok()?.clone() {
3772            Object::Stream(_) => Some(*id),
3773            Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, &states),
3774            _ => None,
3775        },
3776        Object::Stream(stream) => Some(doc.add_object(Object::Stream(stream.clone()))),
3777        Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, states),
3778        _ => None,
3779    }
3780}
3781
3782fn resolve_appearance_state(
3783    doc: &mut Document,
3784    annot_dict: &Dictionary,
3785    states: &Dictionary,
3786) -> Option<ObjectId> {
3787    if let Some(state) = selected_widget_state(annot_dict) {
3788        if let Ok(object) = states.get(state) {
3789            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3790                return Some(id);
3791            }
3792        }
3793        if state == b"Off" {
3794            // Previously assumed oracle always stamps on-mark for hybrid XFA widgets.
3795            // Corrected per GL-WF-01/M#55: /AS state from source data is authoritative.
3796            return None;
3797        }
3798    }
3799
3800    for fallback in [b"Yes".as_slice(), b"On".as_slice(), b"Off".as_slice()] {
3801        if let Ok(object) = states.get(fallback) {
3802            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3803                return Some(id);
3804            }
3805        }
3806    }
3807
3808    for (_name, object) in states.iter() {
3809        if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3810            return Some(id);
3811        }
3812    }
3813
3814    None
3815}
3816
3817fn selected_widget_state(annot_dict: &Dictionary) -> Option<&[u8]> {
3818    annot_dict
3819        .get(b"AS")
3820        .ok()
3821        .and_then(|obj| obj.as_name().ok())
3822        .or_else(|| annot_dict.get(b"V").ok().and_then(|obj| obj.as_name().ok()))
3823}
3824
3825fn add_xobject_to_page_resources(
3826    doc: &mut Document,
3827    page_id: ObjectId,
3828    name: &str,
3829    xobject_id: ObjectId,
3830) {
3831    let resources_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
3832        page_dict
3833            .get(b"Resources")
3834            .ok()
3835            .and_then(|obj| obj.as_reference().ok())
3836    });
3837
3838    if let Some(resources_id) = resources_ref {
3839        let xobject_ref = doc.get_dictionary(resources_id).ok().and_then(|resources| {
3840            resources
3841                .get(b"XObject")
3842                .ok()
3843                .and_then(|obj| obj.as_reference().ok())
3844        });
3845
3846        if let Some(xobject_dict_id) = xobject_ref {
3847            if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
3848                xobjects.set(name, Object::Reference(xobject_id));
3849                return;
3850            }
3851        }
3852
3853        if let Ok(Object::Dictionary(ref mut resources)) = doc.get_object_mut(resources_id) {
3854            add_xobject_to_resources_dict(resources, name, xobject_id);
3855            return;
3856        }
3857    }
3858
3859    let inline_xobject_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
3860        page_dict
3861            .get(b"Resources")
3862            .ok()
3863            .and_then(|obj| obj.as_dict().ok())
3864            .and_then(|resources| {
3865                resources
3866                    .get(b"XObject")
3867                    .ok()
3868                    .and_then(|obj| obj.as_reference().ok())
3869            })
3870    });
3871
3872    if let Some(xobject_dict_id) = inline_xobject_ref {
3873        if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
3874            xobjects.set(name, Object::Reference(xobject_id));
3875            return;
3876        }
3877    }
3878
3879    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3880        if let Ok(Object::Dictionary(ref mut resources)) = page_dict.get_mut(b"Resources") {
3881            add_xobject_to_resources_dict(resources, name, xobject_id);
3882            return;
3883        }
3884
3885        let mut resources = Dictionary::new();
3886        add_xobject_to_resources_dict(&mut resources, name, xobject_id);
3887        page_dict.set("Resources", Object::Dictionary(resources));
3888    }
3889}
3890
3891fn add_xobject_to_resources_dict(resources: &mut Dictionary, name: &str, xobject_id: ObjectId) {
3892    if let Ok(Object::Dictionary(ref mut xobjects)) = resources.get_mut(b"XObject") {
3893        xobjects.set(name, Object::Reference(xobject_id));
3894    } else {
3895        let mut xobjects = Dictionary::new();
3896        xobjects.set(name, Object::Reference(xobject_id));
3897        resources.set("XObject", Object::Dictionary(xobjects));
3898    }
3899}
3900
3901fn append_to_page_content(doc: &mut Document, page_id: ObjectId, data: &[u8]) {
3902    let new_stream_id = doc.add_object(Object::Stream(Stream::new(dictionary! {}, data.to_vec())));
3903
3904    let contents = doc
3905        .get_dictionary(page_id)
3906        .ok()
3907        .and_then(|page_dict| page_dict.get(b"Contents").ok().cloned());
3908
3909    // Some PDFs store page /Contents as an indirect array of streams. Appending
3910    // by wrapping that array reference in another array creates nested content
3911    // arrays (`[ 1510 0 R 1574 0 R ]` where `1510 0 R` is itself an array),
3912    // which Poppler treats as "Weird page contents" and can blank the page.
3913    // Flatten the existing /Contents tree first so preserve-static/widget bake
3914    // paths remain valid on Adobe-generated forms like 697eeb9f.
3915    let new_contents = match contents {
3916        Some(existing) => {
3917            let mut flattened = Vec::new();
3918            flatten_page_contents_entries(doc, existing, &mut flattened);
3919            flattened.push(Object::Reference(new_stream_id));
3920            if flattened.len() == 1 {
3921                // SAFETY: len == 1 is checked on the line above.
3922                flattened.pop().expect("flattened.len() == 1")
3923            } else {
3924                Object::Array(flattened)
3925            }
3926        }
3927        None => Object::Reference(new_stream_id),
3928    };
3929
3930    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3931        page_dict.set("Contents", new_contents);
3932    }
3933}
3934
3935fn flatten_page_contents_entries(doc: &mut Document, object: Object, out: &mut Vec<Object>) {
3936    match object {
3937        Object::Reference(id) => match doc.get_object(id).cloned() {
3938            Ok(Object::Array(items)) => {
3939                for item in items {
3940                    flatten_page_contents_entries(doc, item, out);
3941                }
3942            }
3943            _ => out.push(Object::Reference(id)),
3944        },
3945        Object::Array(items) => {
3946            for item in items {
3947                flatten_page_contents_entries(doc, item, out);
3948            }
3949        }
3950        Object::Stream(stream) => {
3951            let stream_id = doc.add_object(Object::Stream(stream));
3952            out.push(Object::Reference(stream_id));
3953        }
3954        other => out.push(other),
3955    }
3956}
3957
/// Remove Widget annotations from all pages and strip /AcroForm from the catalog.
///
/// This is the "static-strip" flatten path used for hybrid XFA+static PDFs:
/// the original page content is preserved and only the interactive XFA/AcroForm
/// layer is removed.
///
/// Thin wrapper: all of the work is delegated to `remove_acroform`.
fn strip_widgets_and_acroform(doc: &mut Document) {
    // `remove_acroform` drops the catalog's form entries and also strips the
    // widget annotations from every page.
    remove_acroform(doc);
}
3966
3967/// Replace a page's /Contents stream with XFA overlay bytes and add font resource.
3968fn write_page_content(
3969    doc: &mut Document,
3970    page_id: ObjectId,
3971    overlay: &PageOverlay,
3972    font_ids: &[ObjectId; 3],
3973    embedded_fonts: &[(String, ObjectId)],
3974    page_width: Option<f64>,
3975    page_height: Option<f64>,
3976) -> Result<()> {
3977    let mut resources = make_resources_dict(font_ids, embedded_fonts);
3978
3979    let mut xobjects = Dictionary::new();
3980    for img in &overlay.images {
3981        match embed_image(doc, &img.data, &img.mime_type) {
3982            Ok(result) => {
3983                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
3984            }
3985            Err(e) => {
3986                eprintln!("failed to embed image {}: {}", img.name, e);
3987            }
3988        }
3989    }
3990    if !xobjects.is_empty() {
3991        resources.set("XObject", Object::Dictionary(xobjects));
3992    }
3993
3994    let stream = Stream::new(
3995        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
3996        overlay.content_stream.clone(),
3997    );
3998    let stream_id = doc.add_object(Object::Stream(stream));
3999
4000    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
4001        page_dict.set("Contents", Object::Reference(stream_id));
4002        page_dict.set("Resources", Object::Dictionary(resources));
4003        // Update MediaBox to match the XFA layout page dimensions.
4004        // Dynamic XFA forms often have a placeholder page with different
4005        // dimensions than the template's pageArea (e.g. letter vs A4).
4006        if let (Some(w), Some(h)) = (page_width, page_height) {
4007            page_dict.set(
4008                "MediaBox",
4009                Object::Array(vec![
4010                    Object::Real(0.0),
4011                    Object::Real(0.0),
4012                    Object::Real(w as f32),
4013                    Object::Real(h as f32),
4014                ]),
4015            );
4016        }
4017    }
4018    Ok(())
4019}
4020
4021/// Overlay XFA content on top of existing page content (for static XFA forms).
4022///
4023/// Unlike `write_page_content` which replaces the page content entirely, this
4024/// preserves the original content stream and appends the XFA overlay on top.
4025/// The original resources are preserved and XFA font resources are merged in.
4026fn overlay_page_content(
4027    doc: &mut Document,
4028    page_id: ObjectId,
4029    overlay: &PageOverlay,
4030    font_ids: &[ObjectId; 3],
4031    embedded_fonts: &[(String, ObjectId)],
4032) -> Result<()> {
4033    let xfa_resources = make_resources_dict(font_ids, embedded_fonts);
4034
4035    let mut xfa_xobjects = Dictionary::new();
4036    for img in &overlay.images {
4037        match embed_image(doc, &img.data, &img.mime_type) {
4038            Ok(result) => {
4039                xfa_xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
4040            }
4041            Err(e) => {
4042                eprintln!("failed to embed image {}: {}", img.name, e);
4043            }
4044        }
4045    }
4046
4047    merge_xfa_resources_into_page(doc, page_id, &xfa_resources, &xfa_xobjects);
4048
4049    if !overlay.content_stream.is_empty() {
4050        append_to_page_content(doc, page_id, &overlay.content_stream);
4051    }
4052
4053    Ok(())
4054}
4055
4056/// Merge XFA font/xobject resources into the existing page resources without
4057/// overwriting original entries.
4058fn merge_xfa_resources_into_page(
4059    doc: &mut Document,
4060    page_id: ObjectId,
4061    xfa_resources: &Dictionary,
4062    xfa_xobjects: &Dictionary,
4063) {
4064    let existing_resources = doc
4065        .get_dictionary(page_id)
4066        .ok()
4067        .and_then(|page_dict| {
4068            page_dict.get(b"Resources").ok().and_then(|obj| match obj {
4069                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4070                Object::Dictionary(d) => Some(d.clone()),
4071                _ => None,
4072            })
4073        })
4074        .unwrap_or_default();
4075
4076    let mut merged = existing_resources;
4077
4078    // Merge Font entries: add XFA fonts (F1, F2, F3, embedded) without
4079    // overwriting the page's own fonts.
4080    if let Ok(xfa_font_dict) = xfa_resources.get(b"Font").and_then(|o| o.as_dict()) {
4081        let existing_font = merged
4082            .get(b"Font")
4083            .ok()
4084            .and_then(|obj| match obj {
4085                Object::Dictionary(d) => Some(d.clone()),
4086                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4087                _ => None,
4088            })
4089            .unwrap_or_default();
4090
4091        let mut font_merged = existing_font;
4092        for (key, val) in xfa_font_dict.iter() {
4093            if font_merged.get(key).is_err() {
4094                font_merged.set(key.clone(), val.clone());
4095            }
4096        }
4097        merged.set("Font", Object::Dictionary(font_merged));
4098    }
4099
4100    // Merge XObject entries.
4101    if !xfa_xobjects.is_empty() {
4102        let existing_xobj = merged
4103            .get(b"XObject")
4104            .ok()
4105            .and_then(|obj| match obj {
4106                Object::Dictionary(d) => Some(d.clone()),
4107                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4108                _ => None,
4109            })
4110            .unwrap_or_default();
4111
4112        let mut xobj_merged = existing_xobj;
4113        for (key, val) in xfa_xobjects.iter() {
4114            xobj_merged.set(key.clone(), val.clone());
4115        }
4116        merged.set("XObject", Object::Dictionary(xobj_merged));
4117    }
4118
4119    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
4120        page_dict.set("Resources", Object::Dictionary(merged));
4121    }
4122}
4123
4124/// Add a new page to the document's /Pages tree.
4125fn add_new_page(
4126    doc: &mut Document,
4127    w: f64,
4128    h: f64,
4129    overlay: &PageOverlay,
4130    font_ids: &[ObjectId; 3],
4131    embedded_fonts: &[(String, ObjectId)],
4132) -> Result<()> {
4133    let mut resources = make_resources_dict(font_ids, embedded_fonts);
4134
4135    let mut xobjects = Dictionary::new();
4136    for img in &overlay.images {
4137        match embed_image(doc, &img.data, &img.mime_type) {
4138            Ok(result) => {
4139                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
4140            }
4141            Err(e) => {
4142                eprintln!("failed to embed image {}: {}", img.name, e);
4143            }
4144        }
4145    }
4146    if !xobjects.is_empty() {
4147        resources.set("XObject", Object::Dictionary(xobjects));
4148    }
4149
4150    let stream = Stream::new(
4151        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
4152        overlay.content_stream.clone(),
4153    );
4154    let stream_id = doc.add_object(Object::Stream(stream));
4155
4156    // Find the /Pages root to append to.
4157    let pages_id = find_pages_root(doc)?;
4158
4159    let page_id = doc.add_object(Object::Dictionary(dictionary! {
4160        "Type"      => Object::Name(b"Page".to_vec()),
4161        "Parent"    => Object::Reference(pages_id),
4162        "MediaBox"  => Object::Array(vec![
4163            Object::Integer(0), Object::Integer(0),
4164            Object::Real(w as f32), Object::Real(h as f32),
4165        ]),
4166        "Contents"  => Object::Reference(stream_id),
4167        "Resources" => Object::Dictionary(resources)
4168    }));
4169
4170    // Append to /Kids and increment /Count.
4171    if let Ok(Object::Dictionary(ref mut pages_dict)) = doc.get_object_mut(pages_id) {
4172        if let Ok(Object::Array(ref mut kids)) = pages_dict.get_mut(b"Kids") {
4173            kids.push(Object::Reference(page_id));
4174        }
4175        if let Ok(Object::Integer(ref mut count)) = pages_dict.get_mut(b"Count") {
4176            *count += 1;
4177        }
4178    }
4179    Ok(())
4180}
4181
4182fn make_resources_dict(
4183    font_ids: &[ObjectId; 3],
4184    embedded_fonts: &[(String, ObjectId)],
4185) -> Dictionary {
4186    let mut fonts = Dictionary::new();
4187    fonts.set("F1", Object::Reference(font_ids[0]));
4188    fonts.set("F2", Object::Reference(font_ids[1]));
4189    fonts.set("F3", Object::Reference(font_ids[2]));
4190    for (name, obj_id) in embedded_fonts {
4191        fonts.set(name.as_str(), Object::Reference(*obj_id));
4192    }
4193    let mut resources = Dictionary::new();
4194    resources.set("Font", Object::Dictionary(fonts));
4195    resources
4196}
4197
4198fn find_pages_root(doc: &Document) -> Result<ObjectId> {
4199    let root_id = doc
4200        .trailer
4201        .get(b"Root")
4202        .ok()
4203        .and_then(|o: &Object| o.as_reference().ok())
4204        .ok_or_else(|| XfaError::LoadFailed("no /Root in trailer".to_string()))?;
4205    let catalog = doc
4206        .get_dictionary(root_id)
4207        .map_err(|e| XfaError::LoadFailed(format!("catalog: {e}")))?;
4208    catalog
4209        .get(b"Pages")
4210        .ok()
4211        .and_then(|o: &Object| o.as_reference().ok())
4212        .ok_or_else(|| XfaError::LoadFailed("no /Pages in catalog".to_string()))
4213}
4214
4215/// Remove all interactive XFA/AcroForm artifacts from the PDF document.
4216///
4217/// XFA-F6-02 (#1110): this function ensures the output is a clean static PDF
4218/// with no residual interactive form markers. Steps performed:
4219///
4220/// 1. Remove `/AcroForm` from the catalog.
4221/// 2. Remove `/NeedsRendering` from the catalog.
4222/// 3. Remove `/XFA` from the AcroForm dictionary (if it was an indirect object
4223///    whose dict still exists in the object table).
4224/// 4. Remove orphaned XFA packet objects and the unreachable AcroForm object
4225///    from the lopdf object table.
4226/// 5. Remove widget annotations from all page `/Annots` arrays.
4227/// 6. Remove empty `/Annots` arrays left behind after widget removal.
4228fn remove_acroform(doc: &mut Document) {
4229    let root_id = match doc.trailer.get(b"Root") {
4230        Ok(Object::Reference(id)) => *id,
4231        _ => return,
4232    };
4233
4234    // Step 1 & 2: remove /AcroForm and /NeedsRendering from catalog.
4235    // Also capture the AcroForm object ID so we can clean up /XFA inside it.
4236    let acroform_id: Option<ObjectId> = {
4237        if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
4238            let acroform_ref = dict.get(b"AcroForm").ok().and_then(|o| {
4239                if let Object::Reference(id) = o {
4240                    Some(*id)
4241                } else {
4242                    None
4243                }
4244            });
4245            dict.remove(b"AcroForm");
4246            dict.remove(b"NeedsRendering");
4247            acroform_ref
4248        } else {
4249            None
4250        }
4251    };
4252
4253    // Step 3: collect /XFA stream object IDs, then remove /XFA from the
4254    // AcroForm dictionary object.
4255    let xfa_stream_ids: Vec<ObjectId> = acroform_id
4256        .and_then(|af_id| doc.get_dictionary(af_id).ok())
4257        .map(|af_dict| match af_dict.get(b"XFA") {
4258            Ok(Object::Array(arr)) => arr
4259                .iter()
4260                .filter_map(|o| {
4261                    if let Object::Reference(id) = o {
4262                        Some(*id)
4263                    } else {
4264                        None
4265                    }
4266                })
4267                .collect(),
4268            Ok(Object::Reference(id)) => vec![*id],
4269            _ => Vec::new(),
4270        })
4271        .unwrap_or_default();
4272
4273    if let Some(af_id) = acroform_id {
4274        if let Ok(Object::Dictionary(ref mut af_dict)) = doc.get_object_mut(af_id) {
4275            af_dict.remove(b"XFA");
4276        }
4277    }
4278
4279    // Step 4 (FSC-05): purge orphaned XFA packet objects and the unreachable
4280    // AcroForm dictionary from the object table. lopdf serializes every object
4281    // still present in doc.objects, even if the catalog no longer references it.
4282    for stream_id in xfa_stream_ids {
4283        doc.objects.remove(&stream_id);
4284    }
4285    if let Some(af_id) = acroform_id {
4286        doc.objects.remove(&af_id);
4287    }
4288
4289    // Step 5 & 6: remove widget annotations from every page's /Annots array,
4290    // then drop empty /Annots arrays entirely.
4291    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
4292    for page_id in page_ids {
4293        strip_widget_annotations(doc, page_id);
4294    }
4295}
4296
4297// ---------------------------------------------------------------------------
4298// XFA-F6-03 (#1111): Post-flatten validation
4299// ---------------------------------------------------------------------------
4300
/// Result of a post-flatten validation pass.
///
/// All `has_no_*` fields should be `true` for a clean flat PDF. Any
/// remaining XFA artifacts are reported in `warnings`. Leftover widget
/// annotations are reported through `warnings` only; they do not change any
/// of the `has_no_*` flags.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FlattenValidation {
    /// True when the catalog contains no `/XFA` entry (directly or via AcroForm).
    pub has_no_xfa: bool,
    /// True when the catalog contains no `/NeedsRendering` entry.
    pub has_no_needs_rendering: bool,
    /// True when the catalog contains no `/AcroForm` entry.
    pub has_no_acroform: bool,
    /// Number of pages in the output PDF.
    pub page_count: usize,
    /// Human-readable warnings for each detected XFA artifact.
    pub warnings: Vec<String>,
}
4317
4318/// Validate that a PDF has been fully flattened (no XFA/AcroForm artifacts remain).
4319///
4320/// Returns a [`FlattenValidation`] summary. Call after [`flatten_xfa_to_pdf`] to
4321/// confirm the output is clean.
4322///
4323/// This function never panics — parse failures produce a validation result with
4324/// all `has_no_*` fields set to `false` and a warning explaining the parse error.
4325pub fn validate_flattened_pdf(pdf_bytes: &[u8]) -> Result<FlattenValidation> {
4326    if pdf_bytes.is_empty() {
4327        return Ok(FlattenValidation {
4328            has_no_xfa: true,
4329            has_no_needs_rendering: true,
4330            has_no_acroform: true,
4331            page_count: 0,
4332            warnings: vec!["empty input — no PDF to validate".into()],
4333        });
4334    }
4335
4336    let doc = match Document::load_mem(pdf_bytes) {
4337        Ok(d) => d,
4338        Err(e) => {
4339            return Ok(FlattenValidation {
4340                has_no_xfa: false,
4341                has_no_needs_rendering: false,
4342                has_no_acroform: false,
4343                page_count: 0,
4344                warnings: vec![format!("could not parse PDF: {e}")],
4345            });
4346        }
4347    };
4348
4349    let mut warnings = Vec::new();
4350    let mut has_no_xfa = true;
4351    let mut has_no_needs_rendering = true;
4352    let mut has_no_acroform = true;
4353
4354    // Check catalog for AcroForm, NeedsRendering, and XFA.
4355    let root_id = doc.trailer.get(b"Root").ok().and_then(|o| {
4356        if let Object::Reference(id) = o {
4357            Some(*id)
4358        } else {
4359            None
4360        }
4361    });
4362
4363    if let Some(rid) = root_id {
4364        if let Ok(catalog) = doc.get_dictionary(rid) {
4365            if catalog.get(b"AcroForm").is_ok() {
4366                has_no_acroform = false;
4367                warnings.push("/AcroForm still present in catalog".into());
4368
4369                // Check whether the AcroForm dict contains /XFA.
4370                let acroform_has_xfa = catalog
4371                    .get(b"AcroForm")
4372                    .ok()
4373                    .and_then(|o| match o {
4374                        Object::Reference(id) => doc.get_dictionary(*id).ok(),
4375                        Object::Dictionary(d) => Some(d),
4376                        _ => None,
4377                    })
4378                    .map(|d| d.get(b"XFA").is_ok())
4379                    .unwrap_or(false);
4380
4381                if acroform_has_xfa {
4382                    has_no_xfa = false;
4383                    warnings.push("/XFA still present in AcroForm dictionary".into());
4384                }
4385            }
4386
4387            if catalog.get(b"NeedsRendering").is_ok() {
4388                has_no_needs_rendering = false;
4389                warnings.push("/NeedsRendering still present in catalog".into());
4390            }
4391
4392            // Direct /XFA on catalog (non-standard but possible).
4393            if catalog.get(b"XFA").is_ok() {
4394                has_no_xfa = false;
4395                warnings.push("/XFA still present directly in catalog".into());
4396            }
4397        }
4398    }
4399
4400    // Check page annotations for widget annotations.
4401    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
4402    let page_count = page_ids.len();
4403    for page_id in page_ids {
4404        for annot_obj in page_annotations(&doc, page_id) {
4405            let is_widget = annot_obj
4406                .as_reference()
4407                .ok()
4408                .and_then(|id| doc.get_dictionary(id).ok())
4409                .and_then(|d| {
4410                    d.get(b"Subtype")
4411                        .ok()
4412                        .map(|st| st == &Object::Name(b"Widget".to_vec()))
4413                })
4414                .unwrap_or(false);
4415            if is_widget {
4416                warnings.push(format!(
4417                    "widget annotation found on page (object {:?})",
4418                    annot_obj
4419                ));
4420            }
4421        }
4422    }
4423
4424    Ok(FlattenValidation {
4425        has_no_xfa,
4426        has_no_needs_rendering,
4427        has_no_acroform,
4428        page_count,
4429        warnings,
4430    })
4431}
4432
4433// ---------------------------------------------------------------------------
4434// XFA-F6-04 (#1112): Flatten quality metrics
4435// ---------------------------------------------------------------------------
4436
/// Metrics comparing a PDF before and after flattening.
///
/// Used by [`compare_flatten_quality`] and the `flatten-check` CLI subcommand.
#[derive(Debug, Clone, PartialEq)]
pub struct FlattenQualityMetrics {
    /// Number of pages in the original (pre-flatten) PDF.
    pub page_count_before: usize,
    /// Number of pages in the flattened (post-flatten) PDF.
    pub page_count_after: usize,
    /// True when `page_count_before == page_count_after`.
    pub page_count_match: bool,
    /// Total byte length of all content streams in the original PDF.
    pub content_stream_bytes_before: usize,
    /// Total byte length of all content streams in the flattened PDF.
    pub content_stream_bytes_after: usize,
    /// Ratio of after/before content stream bytes. 1.0 = same size, <1.0 = smaller.
    /// Set to 1.0 when `content_stream_bytes_before == 0` to avoid division by zero.
    pub content_ratio: f64,
}
4455
4456/// Compute quality metrics comparing the original PDF to its flattened version.
4457///
4458/// Parses both byte slices and compares page count and total content stream size.
4459/// Returns an error only if both PDFs fail to parse.
4460pub fn compare_flatten_quality(
4461    original_bytes: &[u8],
4462    flattened_bytes: &[u8],
4463) -> Result<FlattenQualityMetrics> {
4464    fn count_pages_and_stream_bytes(pdf_bytes: &[u8]) -> (usize, usize) {
4465        let doc = match Document::load_mem(pdf_bytes) {
4466            Ok(d) => d,
4467            Err(_) => return (0, 0),
4468        };
4469        let page_count = doc.page_iter().count();
4470        let stream_bytes: usize = doc
4471            .objects
4472            .values()
4473            .filter_map(|obj| {
4474                if let Object::Stream(s) = obj {
4475                    // Use decompressed content length when available.
4476                    s.content.len().into()
4477                } else {
4478                    None
4479                }
4480            })
4481            .sum();
4482        (page_count, stream_bytes)
4483    }
4484
4485    let (page_count_before, content_stream_bytes_before) =
4486        count_pages_and_stream_bytes(original_bytes);
4487    let (page_count_after, content_stream_bytes_after) =
4488        count_pages_and_stream_bytes(flattened_bytes);
4489
4490    let content_ratio = if content_stream_bytes_before == 0 {
4491        1.0_f64
4492    } else {
4493        content_stream_bytes_after as f64 / content_stream_bytes_before as f64
4494    };
4495
4496    Ok(FlattenQualityMetrics {
4497        page_count_before,
4498        page_count_after,
4499        page_count_match: page_count_before == page_count_after,
4500        content_stream_bytes_before,
4501        content_stream_bytes_after,
4502        content_ratio,
4503    })
4504}
4505
4506// ---------------------------------------------------------------------------
4507// XFA-F7-02 (#1114): Text completeness validation
4508// ---------------------------------------------------------------------------
4509
/// Result of a text completeness validation pass.
///
/// Compares the data values bound in the original XFA datasets against the
/// text content extracted from the flattened PDF to verify all field values
/// appear in the output. The derives let callers log (`Debug`), copy (`Clone`)
/// and compare (`PartialEq`) results.
#[derive(Debug, Clone, PartialEq)]
pub struct TextValidation {
    /// Data values extracted from the original XFA datasets XML.
    pub expected_values: Vec<String>,
    /// Values from `expected_values` that were found in the output text.
    pub found_values: Vec<String>,
    /// Values from `expected_values` that were NOT found in the output text.
    pub missing_values: Vec<String>,
    /// Ratio of found/expected. 1.0 means all expected values are present.
    /// Set to 1.0 when `expected_values` is empty (nothing to check).
    pub completeness_ratio: f64,
}
4526
4527/// Extract all non-empty text node values from XFA `<field>` elements in the
4528/// datasets XML packet.
4529fn extract_field_values_from_datasets(datasets_xml: &str) -> Vec<String> {
4530    // Minimal parser: locate every <field …> … </field> block and grab direct
4531    // text content (the value node inside).  We keep this dependency-free by
4532    // doing a simple byte-scan rather than pulling in an XML parser.
4533    let mut values = Vec::new();
4534    let mut remaining = datasets_xml;
4535
4536    while let Some(open_pos) = remaining.find("<field") {
4537        // Advance past the opening tag itself (up to the closing `>`).
4538        let tag_end = match remaining[open_pos..].find('>') {
4539            Some(p) => open_pos + p + 1,
4540            None => break,
4541        };
4542
4543        // Self-closing tag (<field … />) — no value.
4544        if remaining[open_pos..tag_end].ends_with("/>") {
4545            remaining = &remaining[tag_end..];
4546            continue;
4547        }
4548
4549        // Find the matching </field>.
4550        let close_tag = "</field>";
4551        match remaining[tag_end..].find(close_tag) {
4552            Some(close_pos) => {
4553                let inner = &remaining[tag_end..tag_end + close_pos];
4554                // Extract text from a nested <value><text>…</text></value> or
4555                // just plain text between the tags.
4556                let text = extract_innermost_text(inner);
4557                if !text.is_empty() {
4558                    values.push(text);
4559                }
4560                remaining = &remaining[tag_end + close_pos + close_tag.len()..];
4561            }
4562            None => break,
4563        }
4564    }
4565    values
4566}
4567
4568/// Given the inner content of a `<field>` element, return the first non-empty
4569/// text value found (handles `<value><text>…</text></value>` nesting).
4570fn extract_innermost_text(inner: &str) -> String {
4571    // Try <text>…</text> first.
4572    if let Some(start) = inner.find("<text>") {
4573        let content_start = start + "<text>".len();
4574        if let Some(end) = inner[content_start..].find("</text>") {
4575            let s = inner[content_start..content_start + end].trim().to_string();
4576            if !s.is_empty() {
4577                return s;
4578            }
4579        }
4580    }
4581    // Fall back to stripping all XML tags and returning the trimmed text.
4582    let stripped = strip_xml_tags(inner);
4583    stripped.trim().to_string()
4584}
4585
/// Remove XML/HTML tags from a string, returning only the text content.
///
/// Everything from a `<` up to the next `>` is treated as a tag and dropped.
/// A `>` that appears outside a tag is ordinary character data (it is legal,
/// unescaped, in XML text) and is kept. An unterminated tag swallows the rest
/// of the input. Entities are not decoded here.
fn strip_xml_tags(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut in_tag = false;
    for ch in s.chars() {
        match ch {
            '<' => in_tag = true,
            // Only a `>` that closes a tag is markup.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => out.push(ch),
            _ => {}
        }
    }
    out
}
4600
4601/// Extract visible text from a PDF content stream by scanning for the `Tj`,
4602/// `TJ`, `'`, and `"` text-showing operators.
4603///
4604/// This is a best-effort scan of the raw (potentially un-decoded) bytes.
4605/// It does not handle all encodings or compressed streams but is sufficient
4606/// for validating that literal ASCII/Latin text values are present.
4607fn extract_text_from_pdf_bytes(pdf_bytes: &[u8]) -> String {
4608    let doc = match Document::load_mem(pdf_bytes) {
4609        Ok(d) => d,
4610        Err(_) => return String::new(),
4611    };
4612
4613    let mut text = String::new();
4614
4615    for obj in doc.objects.values() {
4616        if let Object::Stream(ref stream) = obj {
4617            // Read raw stream content (decompression may fail silently).
4618            let content = match stream.decompressed_content() {
4619                Ok(c) => c,
4620                Err(_) => stream.content.clone(),
4621            };
4622            let fragment = extract_text_from_content_stream(&content);
4623            if !fragment.is_empty() {
4624                text.push(' ');
4625                text.push_str(&fragment);
4626            }
4627        }
4628    }
4629    text
4630}
4631
/// Collect the literal strings found in a PDF content stream.
///
/// Every balanced, parenthesis-delimited literal `( … )` is considered. No
/// check is made that a text-showing operator (`Tj`, `TJ`, `'`, `"`) follows
/// it, so this is a superset of the shown text. A literal is kept only when
/// its raw form consists solely of ASCII alphanumerics, whitespace and
/// punctuation, which filters out binary strings.
///
/// Kept literals have their backslash escapes resolved (`\(`, `\)`, `\\`,
/// `\n`, `\r`, `\t`, `\b`, `\f`, octal `\ddd`, and backslash-newline line
/// continuations), are trimmed, and are appended to the result, each preceded
/// by a single space. Scanning resumes *after* a kept literal, so nested or
/// escaped parentheses inside it are not reported a second time.
///
/// Hex strings (`<…>`) are not handled, and the pieces of a `TJ` array are
/// emitted as separate space-separated fragments.
fn extract_text_from_content_stream(content: &[u8]) -> String {
    /// Index of the `)` that closes the literal whose body starts at `start`,
    /// honouring nested parentheses and backslash escapes. `None` when the
    /// literal is never closed.
    fn literal_end(bytes: &[u8], start: usize) -> Option<usize> {
        let mut depth = 1usize;
        let mut idx = start;
        while idx < bytes.len() {
            match bytes[idx] {
                // Skip the escaped byte so `\(` / `\)` do not affect nesting.
                b'\\' => idx += 1,
                b'(' => depth += 1,
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(idx);
                    }
                }
                _ => {}
            }
            idx += 1;
        }
        None
    }

    /// Resolve the backslash escapes of a PDF literal string body.
    fn decode_escapes(literal: &str) -> String {
        let mut out = String::with_capacity(literal.len());
        let mut chars = literal.chars().peekable();
        while let Some(ch) = chars.next() {
            if ch != '\\' {
                out.push(ch);
                continue;
            }
            match chars.next() {
                Some('n') => out.push('\n'),
                Some('r') => out.push('\r'),
                Some('t') => out.push('\t'),
                Some('b') => out.push('\u{8}'),
                Some('f') => out.push('\u{c}'),
                // Backslash + end-of-line is a line continuation: emit nothing.
                Some('\r') => {
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                }
                Some('\n') => {}
                // Octal character code: one to three digits, high bits dropped.
                Some(first) if first.is_digit(8) => {
                    let mut code = first.to_digit(8).unwrap_or(0);
                    for _ in 0..2 {
                        match chars.peek().and_then(|next| next.to_digit(8)) {
                            Some(digit) => {
                                code = code * 8 + digit;
                                chars.next();
                            }
                            None => break,
                        }
                    }
                    out.push(char::from((code & 0xFF) as u8));
                }
                // `\(`, `\)`, `\\`, and any unknown escape: drop the backslash.
                Some(other) => out.push(other),
                None => {}
            }
        }
        out
    }

    let text = String::from_utf8_lossy(content);
    let bytes = text.as_bytes();
    let mut result = String::new();

    let mut pos = 0;
    while pos < bytes.len() {
        if bytes[pos] != b'(' {
            pos += 1;
            continue;
        }

        let start = pos + 1;
        // By default resume right after this '(' so that literals nested in a
        // rejected or unbalanced span are still examined on their own.
        pos = start;

        let end = match literal_end(bytes, start) {
            Some(end) => end,
            None => continue,
        };

        // `start` follows an ASCII '(' and `end` is an ASCII ')', so both are
        // valid char boundaries of `text`.
        let literal = &text[start..end];

        // Only collect printable ASCII — skip binary font strings.
        let printable = literal.chars().all(|c| {
            c.is_ascii() && (c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation())
        });
        if !printable {
            continue;
        }

        let decoded = decode_escapes(literal);
        let trimmed = decoded.trim();
        if !trimmed.is_empty() {
            result.push(' ');
            result.push_str(trimmed);
        }
        // The whole literal has been consumed; do not re-scan its interior.
        pos = end + 1;
    }
    result
}
4676
4677/// Validate that all data values bound in the original XFA form appear in the
4678/// text content of the flattened PDF.
4679///
4680/// Steps:
4681/// 1. Extract the XFA `datasets` packet from `original_xfa_bytes`.
4682/// 2. Parse all `<field>` values from the datasets XML.
4683/// 3. Scan the flattened PDF's content streams for those strings.
4684/// 4. Return a [`TextValidation`] with completeness metrics.
4685///
4686/// Returns `Ok` even when the datasets packet is absent or the XFA cannot be
4687/// parsed — in that case `expected_values` will be empty and
4688/// `completeness_ratio` will be `1.0`.
4689pub fn validate_text_completeness(
4690    original_xfa_bytes: &[u8],
4691    flattened_bytes: &[u8],
4692) -> crate::error::Result<TextValidation> {
4693    // Step 1: extract the datasets packet from the original XFA PDF.
4694    let packets = match crate::extract::extract_xfa_from_bytes(original_xfa_bytes.to_vec()) {
4695        Ok(p) => p,
4696        Err(_) => {
4697            // Cannot extract XFA — nothing to validate.
4698            return Ok(TextValidation {
4699                expected_values: vec![],
4700                found_values: vec![],
4701                missing_values: vec![],
4702                completeness_ratio: 1.0,
4703            });
4704        }
4705    };
4706
4707    let datasets_xml = match packets.datasets() {
4708        Some(ds) => ds.to_string(),
4709        None => {
4710            return Ok(TextValidation {
4711                expected_values: vec![],
4712                found_values: vec![],
4713                missing_values: vec![],
4714                completeness_ratio: 1.0,
4715            });
4716        }
4717    };
4718
4719    // Step 2: extract field values.
4720    let expected_values = extract_field_values_from_datasets(&datasets_xml);
4721
4722    if expected_values.is_empty() {
4723        return Ok(TextValidation {
4724            expected_values: vec![],
4725            found_values: vec![],
4726            missing_values: vec![],
4727            completeness_ratio: 1.0,
4728        });
4729    }
4730
4731    // Step 3: extract text from the flattened PDF.
4732    let output_text = extract_text_from_pdf_bytes(flattened_bytes);
4733
4734    // Step 4: check which expected values appear in the output.
4735    let mut found_values = Vec::new();
4736    let mut missing_values = Vec::new();
4737
4738    for value in &expected_values {
4739        if output_text.contains(value.as_str()) {
4740            found_values.push(value.clone());
4741        } else {
4742            missing_values.push(value.clone());
4743        }
4744    }
4745
4746    let completeness_ratio = if expected_values.is_empty() {
4747        1.0
4748    } else {
4749        found_values.len() as f64 / expected_values.len() as f64
4750    };
4751
4752    Ok(TextValidation {
4753        expected_values,
4754        found_values,
4755        missing_values,
4756        completeness_ratio,
4757    })
4758}
4759
4760// ---------------------------------------------------------------------------
4761// Tests
4762// ---------------------------------------------------------------------------
4763
/// Test helper (GL-QA36): call `flatten_xfa_to_pdf` as if it were already
/// running on this thread.
///
/// `FLATTEN_DEPTH` is set to 1 before the call so the function observes a
/// non-zero depth on entry, which lets tests exercise the recursion guard
/// without the test sub-module touching the thread-local directly.
///
/// IMPORTANT: `FLATTEN_DEPTH` is forced back to 0 before returning so that
/// later calls on the same thread start from a clean state.
#[cfg(test)]
fn flatten_xfa_to_pdf_simulate_reentrant(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
    // Pretend one flatten call is already in progress.
    FLATTEN_DEPTH.with(|depth| depth.set(1));

    let outcome = flatten_xfa_to_pdf(pdf_bytes);

    // Unconditional reset: per the original author's note, the guard is
    // expected to bail out early and leave the counter at 1.
    FLATTEN_DEPTH.with(|depth| depth.set(0));

    outcome
}
4779
4780#[cfg(test)]
4781mod tests {
4782    use super::*;
4783
4784    /// Build a minimal XFA PDF in memory (same as generate_xfa_layout_fixtures).
4785    fn build_xfa_pdf_with_content(xdp: &str, page_content: Vec<u8>) -> Vec<u8> {
4786        use lopdf::{dictionary, Document, Object, Stream};
4787        let mut doc = Document::with_version("1.4");
4788        let xdp_bytes = xdp.as_bytes().to_vec();
4789        let xfa_stream = Stream::new(
4790            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4791            xdp_bytes,
4792        );
4793        let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4794        let pages_id = doc.new_object_id();
4795        let content_stream = Stream::new(
4796            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4797            page_content,
4798        );
4799        let content_id = doc.add_object(Object::Stream(content_stream));
4800        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4801            "Type"     => Object::Name(b"Page".to_vec()),
4802            "Parent"   => Object::Reference(pages_id),
4803            "MediaBox" => Object::Array(vec![
4804                Object::Integer(0), Object::Integer(0),
4805                Object::Integer(612), Object::Integer(792),
4806            ]),
4807            "Contents" => Object::Reference(content_id)
4808        }));
4809        doc.objects.insert(
4810            pages_id,
4811            Object::Dictionary(dictionary! {
4812                "Type"  => Object::Name(b"Pages".to_vec()),
4813                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4814                "Count" => Object::Integer(1)
4815            }),
4816        );
4817        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4818            "XFA"    => Object::Reference(xfa_id),
4819            "Fields" => Object::Array(vec![])
4820        }));
4821        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4822            "Type"     => Object::Name(b"Catalog".to_vec()),
4823            "Pages"    => Object::Reference(pages_id),
4824            "AcroForm" => Object::Reference(acroform_id)
4825        }));
4826        doc.trailer.set("Root", Object::Reference(catalog_id));
4827        let mut out = Vec::new();
4828        doc.save_to(&mut out).unwrap();
4829        out
4830    }
4831
4832    fn build_xfa_pdf(xdp: &str) -> Vec<u8> {
4833        build_xfa_pdf_with_content(xdp, Vec::new())
4834    }
4835
4836    fn build_xfa_doc_with_xfa_array() -> (Document, ObjectId, Vec<ObjectId>) {
4837        use lopdf::{dictionary, Document, Object, Stream};
4838
4839        let mut doc = Document::with_version("1.4");
4840        let pages_id = doc.new_object_id();
4841        let content_id = doc.add_object(Object::Stream(Stream::new(
4842            dictionary! { "Length" => Object::Integer(0) },
4843            Vec::new(),
4844        )));
4845        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4846            "Type"     => Object::Name(b"Page".to_vec()),
4847            "Parent"   => Object::Reference(pages_id),
4848            "MediaBox" => Object::Array(vec![
4849                Object::Integer(0), Object::Integer(0),
4850                Object::Integer(612), Object::Integer(792),
4851            ]),
4852            "Contents" => Object::Reference(content_id)
4853        }));
4854        doc.objects.insert(
4855            pages_id,
4856            Object::Dictionary(dictionary! {
4857                "Type"  => Object::Name(b"Pages".to_vec()),
4858                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4859                "Count" => Object::Integer(1)
4860            }),
4861        );
4862
4863        let packet_payloads = [
4864            (
4865                b"xdp:xdp".to_vec(),
4866                br#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#.to_vec(),
4867            ),
4868            (
4869                b"template".to_vec(),
4870                br#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform/></template>"#
4871                    .to_vec(),
4872            ),
4873            (
4874                b"datasets".to_vec(),
4875                br#"<xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"></xfa:datasets>"#
4876                    .to_vec(),
4877            ),
4878        ];
4879
4880        let mut xfa_array = Vec::new();
4881        let mut xfa_ids = Vec::new();
4882        for (packet_name, payload) in packet_payloads {
4883            let stream_id = doc.add_object(Object::Stream(Stream::new(
4884                dictionary! { "Length" => Object::Integer(payload.len() as i64) },
4885                payload,
4886            )));
4887            xfa_array.push(Object::Name(packet_name));
4888            xfa_array.push(Object::Reference(stream_id));
4889            xfa_ids.push(stream_id);
4890        }
4891
4892        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4893            "XFA"    => Object::Array(xfa_array),
4894            "Fields" => Object::Array(vec![])
4895        }));
4896        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4897            "Type"     => Object::Name(b"Catalog".to_vec()),
4898            "Pages"    => Object::Reference(pages_id),
4899            "AcroForm" => Object::Reference(acroform_id)
4900        }));
4901        doc.trailer.set("Root", Object::Reference(catalog_id));
4902        (doc, acroform_id, xfa_ids)
4903    }
4904
4905    fn build_xfa_pdf_with_widget_appearance(
4906        page_content: Vec<u8>,
4907        normal_appearance: Object,
4908        widget_extra: Dictionary,
4909    ) -> Vec<u8> {
4910        use lopdf::{dictionary, Document, Object, Stream};
4911
4912        let mut doc = Document::with_version("1.4");
4913        let xdp_bytes = SIMPLE_XDP.as_bytes().to_vec();
4914        let xfa_stream = Stream::new(
4915            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4916            xdp_bytes,
4917        );
4918        let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4919
4920        let pages_id = doc.new_object_id();
4921        let content_id = doc.add_object(Object::Stream(Stream::new(
4922            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4923            page_content,
4924        )));
4925
4926        let appearance_id = match normal_appearance {
4927            Object::Reference(id) => id,
4928            other => doc.add_object(other),
4929        };
4930
4931        let widget_id = doc.new_object_id();
4932        let page_id = doc.add_object(Object::Dictionary(dictionary! {
4933            "Type"     => Object::Name(b"Page".to_vec()),
4934            "Parent"   => Object::Reference(pages_id),
4935            "MediaBox" => Object::Array(vec![
4936                Object::Integer(0), Object::Integer(0),
4937                Object::Integer(612), Object::Integer(792),
4938            ]),
4939            "Contents" => Object::Reference(content_id),
4940            "Annots"   => Object::Array(vec![Object::Reference(widget_id)]),
4941            "Resources" => Object::Dictionary(dictionary! {})
4942        }));
4943
4944        let mut widget = dictionary! {
4945            "Type"    => Object::Name(b"Annot".to_vec()),
4946            "Subtype" => Object::Name(b"Widget".to_vec()),
4947            "Rect"    => Object::Array(vec![
4948                Object::Integer(100), Object::Integer(700),
4949                Object::Integer(220), Object::Integer(730),
4950            ]),
4951            "AP"      => Object::Dictionary(dictionary! {
4952                "N" => Object::Reference(appearance_id)
4953            }),
4954            "P"       => Object::Reference(page_id)
4955        };
4956        for (key, value) in widget_extra {
4957            widget.set(key, value);
4958        }
4959        doc.objects.insert(widget_id, Object::Dictionary(widget));
4960
4961        doc.objects.insert(
4962            pages_id,
4963            Object::Dictionary(dictionary! {
4964                "Type"  => Object::Name(b"Pages".to_vec()),
4965                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
4966                "Count" => Object::Integer(1)
4967            }),
4968        );
4969
4970        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4971            "XFA"    => Object::Reference(xfa_id),
4972            "Fields" => Object::Array(vec![Object::Reference(widget_id)])
4973        }));
4974        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4975            "Type"     => Object::Name(b"Catalog".to_vec()),
4976            "Pages"    => Object::Reference(pages_id),
4977            "AcroForm" => Object::Reference(acroform_id)
4978        }));
4979        doc.trailer.set("Root", Object::Reference(catalog_id));
4980
4981        let mut out = Vec::new();
4982        doc.save_to(&mut out).unwrap();
4983        out
4984    }
4985
4986    #[allow(dead_code)]
4987    fn find_last_content_stream(doc: &Document, page_id: ObjectId) -> &Stream {
4988        let page_dict = doc.get_dictionary(page_id).expect("page dict");
4989        match page_dict.get(b"Contents").expect("contents") {
4990            Object::Reference(id) => doc
4991                .get_object(*id)
4992                .expect("contents object")
4993                .as_stream()
4994                .expect("contents stream"),
4995            Object::Array(arr) => {
4996                let last = arr.last().expect("last content stream");
4997                let id = last.as_reference().expect("contents ref");
4998                doc.get_object(id)
4999                    .expect("contents object")
5000                    .as_stream()
5001                    .expect("contents stream")
5002            }
5003            other => other.as_stream().expect("contents stream"),
5004        }
5005    }
5006
5007    #[allow(dead_code)]
5008    fn page_xobjects(doc: &Document, page_id: ObjectId) -> Dictionary {
5009        let page_dict = doc.get_dictionary(page_id).expect("page dict");
5010        let resources = page_dict
5011            .get(b"Resources")
5012            .expect("resources")
5013            .as_dict()
5014            .expect("resources dict");
5015        resources
5016            .get(b"XObject")
5017            .expect("xobjects")
5018            .as_dict()
5019            .expect("xobject dict")
5020            .clone()
5021    }
5022
5023    #[test]
5024    fn append_to_page_content_flattens_indirect_contents_arrays() {
5025        let mut doc = Document::with_version("1.4");
5026        let pages_id = doc.new_object_id();
5027        let first_stream_id = doc.add_object(Stream::new(dictionary! {}, b"q\n".to_vec()));
5028        let second_stream_id = doc.add_object(Stream::new(dictionary! {}, b"Q\n".to_vec()));
5029        let contents_array_id = doc.add_object(Object::Array(vec![
5030            Object::Reference(first_stream_id),
5031            Object::Reference(second_stream_id),
5032        ]));
5033        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5034            "Type" => Object::Name(b"Page".to_vec()),
5035            "Parent" => Object::Reference(pages_id),
5036            "MediaBox" => Object::Array(vec![
5037                Object::Integer(0), Object::Integer(0),
5038                Object::Integer(612), Object::Integer(792),
5039            ]),
5040            "Contents" => Object::Reference(contents_array_id),
5041        }));
5042        doc.objects.insert(
5043            pages_id,
5044            Object::Dictionary(dictionary! {
5045                "Type" => Object::Name(b"Pages".to_vec()),
5046                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5047                "Count" => Object::Integer(1),
5048            }),
5049        );
5050
5051        append_to_page_content(&mut doc, page_id, b"BT\nET\n");
5052
5053        let page_dict = doc.get_dictionary(page_id).expect("page dict");
5054        let contents = page_dict.get(b"Contents").expect("contents");
5055        let items = contents.as_array().expect("flattened contents array");
5056
5057        assert_eq!(items.len(), 3, "existing streams + appended stream");
5058        assert!(
5059            items.iter().all(|obj| obj.as_reference().is_ok()),
5060            "contents array must stay flat and reference only streams"
5061        );
5062        for object in items {
5063            let stream_id = object.as_reference().expect("stream ref");
5064            assert!(
5065                doc.get_object(stream_id)
5066                    .expect("stream object")
5067                    .as_stream()
5068                    .is_ok(),
5069                "nested arrays must not survive in page contents"
5070            );
5071        }
5072    }
5073
    /// Minimal XDP fixture used by several tests in this module.
    ///
    /// Contains one `paginate` root subform with a single page area (8.5in x
    /// 11in medium, 7.5in x 10in content area) and one top-to-bottom section
    /// holding a single text field `firstName`, captioned "First Name" and
    /// pre-filled with the value "John".
    const SIMPLE_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
5094
    /// XDP fixture identical in layout to `SIMPLE_XDP`, plus an `initialize`
    /// event on the section subform whose script is JavaScript
    /// (`app.alert('blocked');`). Used to check how flattening reports
    /// JavaScript it encounters in the template.
    const JS_EVENT_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <event activity="initialize">
        <script contentType="application/x-javascript">app.alert('blocked');</script>
      </event>
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
5118
5119    fn overflowing_paginate_xdp(base_profile: Option<&str>) -> String {
5120        let mut fields = String::new();
5121        for i in 0..40 {
5122            fields.push_str(&format!(
5123                r#"
5124      <field name="line{i}" w="7.0in" h="0.3in">
5125        <ui><textEdit/></ui>
5126        <value><text>Line {i}</text></value>
5127      </field>"#
5128            ));
5129        }
5130
5131        let base_profile_attr = base_profile
5132            .map(|value| format!(r#" baseProfile="{value}""#))
5133            .unwrap_or_default();
5134
5135        format!(
5136            r#"<?xml version="1.0" encoding="UTF-8"?>
5137<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
5138<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"{base_profile_attr}>
5139  <subform name="form1" layout="paginate">
5140    <pageSet>
5141      <pageArea name="Page1">
5142        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
5143        <medium stock="default" short="8.5in" long="11in"/>
5144      </pageArea>
5145    </pageSet>
5146    <subform name="section" layout="tb" w="7.5in">{fields}
5147    </subform>
5148  </subform>
5149</template>
5150</xdp:xdp>"#
5151        )
5152    }
5153
5154    #[test]
5155    fn flatten_simple_form_produces_non_empty_content() {
5156        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5157        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5158
5159        // Load the result and check the content stream is non-empty.
5160        let doc = Document::load_mem(&result).expect("load flattened PDF");
5161        let pages: Vec<ObjectId> = doc.page_iter().collect();
5162        assert!(!pages.is_empty(), "flattened PDF has no pages");
5163
5164        // At least one page should have a non-empty content stream.
5165        let mut found_content = false;
5166        for page_id in &pages {
5167            if let Ok(page_dict) = doc.get_dictionary(*page_id) {
5168                if let Ok(Object::Reference(stream_id)) = page_dict.get(b"Contents") {
5169                    if let Ok(obj) = doc.get_object(*stream_id) {
5170                        if let Ok(stream) = obj.as_stream() {
5171                            if !stream.content.is_empty() {
5172                                found_content = true;
5173                            }
5174                        }
5175                    }
5176                }
5177            }
5178        }
5179        assert!(found_content, "all content streams are empty after flatten");
5180    }
5181
5182    #[test]
5183    fn flatten_reports_best_effort_for_xfa_javascript_event() {
5184        let pdf_bytes = build_xfa_pdf(JS_EVENT_XDP);
5185
5186        let (flattened, metadata) =
5187            flatten_xfa_to_pdf_with_metadata(&pdf_bytes).expect("flatten should skip JS");
5188
5189        assert!(!flattened.is_empty());
5190        assert_eq!(metadata.output_quality, OutputQuality::BestEffort);
5191        assert!(metadata.dynamic_scripts.js_present);
5192        assert_eq!(metadata.dynamic_scripts.js_skipped, 1);
5193    }
5194
5195    #[test]
5196    fn flatten_strips_catalog_open_action_javascript() {
5197        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5198        {
5199            let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5200            let root_id = match doc.trailer.get(b"Root") {
5201                Ok(Object::Reference(id)) => *id,
5202                _ => panic!("no Root in test PDF"),
5203            };
5204            if let Ok(Object::Dictionary(catalog)) = doc.get_object_mut(root_id) {
5205                catalog.set(
5206                    "OpenAction",
5207                    Object::Dictionary(dictionary! {
5208                        "S" => Object::Name(b"JavaScript".to_vec()),
5209                        "JS" => Object::String(
5210                            b"app.alert('blocked')".to_vec(),
5211                            lopdf::StringFormat::Literal,
5212                        ),
5213                    }),
5214                );
5215            }
5216            let mut out = Vec::new();
5217            doc.save_to(&mut out).expect("save test PDF");
5218            pdf_bytes = out;
5219        }
5220
5221        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5222        let doc = Document::load_mem(&flattened).expect("load flattened PDF");
5223        let root_id = match doc.trailer.get(b"Root") {
5224            Ok(Object::Reference(id)) => *id,
5225            _ => panic!("no Root in flattened PDF"),
5226        };
5227        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
5228        assert!(
5229            catalog.get(b"OpenAction").is_err(),
5230            "/OpenAction JavaScript must be stripped from flattened output"
5231        );
5232    }
5233
5234    /// Tests the canonical XFA nesting: <subform layout="paginate"> wraps
5235    /// <pageSet> + lr-tb content rows.  Verifies the flatten produces a single
5236    /// page with visible field content (border operators in the content stream).
5237    /// Before the extract_page_structure fix this produced 2 pages: page 1
5238    /// was blank (pageSet occupied 792pt) and page 2 had the actual fields.
5239    #[test]
5240    fn flatten_paginate_subform_with_nested_pageset_produces_visible_content() {
5241        const LR_TB_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
5242<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
5243<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
5244  <subform name="form1" layout="paginate" locale="en_US">
5245    <pageSet>
5246      <pageArea name="Page1" id="Page1">
5247        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
5248        <medium stock="default" short="8.5in" long="11in"/>
5249      </pageArea>
5250    </pageSet>
5251    <subform name="row1" layout="lr-tb" w="7.5in" h="0.4in">
5252      <field name="firstName" w="3.5in" h="0.4in">
5253        <caption><value><text>First</text></value></caption>
5254        <ui><textEdit/></ui>
5255        <value><text>John</text></value>
5256      </field>
5257      <field name="lastName" w="3.5in" h="0.4in">
5258        <caption><value><text>Last</text></value></caption>
5259        <ui><textEdit/></ui>
5260        <value><text>Doe</text></value>
5261      </field>
5262    </subform>
5263  </subform>
5264</template>
5265</xdp:xdp>"#;
5266
5267        let pdf_bytes = build_xfa_pdf(LR_TB_XDP);
5268        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5269
5270        let doc = Document::load_mem(&result).expect("load flattened PDF");
5271        let pages: Vec<ObjectId> = doc.page_iter().collect();
5272
5273        // Must produce exactly 1 page (not 2 as with the blank-first-page bug).
5274        assert_eq!(pages.len(), 1, "expected 1 page, got {}", pages.len());
5275
5276        // Page 1 must contain visible text operators from the field values.
5277        // (Fields with non-empty values produce WrappedText → BT/ET operators.)
5278        if let Ok(page_dict) = doc.get_dictionary(pages[0]) {
5279            if let Ok(lopdf::Object::Reference(stream_id)) = page_dict.get(b"Contents") {
5280                if let Ok(obj) = doc.get_object(*stream_id) {
5281                    if let Ok(stream) = obj.as_stream() {
5282                        let content = String::from_utf8_lossy(&stream.content);
5283                        assert!(
5284                            content.contains("BT\n"),
5285                            "no text operators in page 1 content stream (should have BT from field values)"
5286                        );
5287                        assert!(
5288                            content.contains("Tj\n"),
5289                            "no text show operators in page 1 content stream"
5290                        );
5291                    }
5292                }
5293            }
5294        }
5295    }
5296
5297    #[test]
5298    fn static_single_page_pdf_does_not_append_xfa_overflow_pages() {
5299        let xdp = overflowing_paginate_xdp(Some("interactiveForms"));
5300        let pdf_bytes = build_xfa_pdf(&xdp);
5301        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5302
5303        let doc = Document::load_mem(&result).expect("load flattened PDF");
5304        let pages: Vec<ObjectId> = doc.page_iter().collect();
5305
5306        assert_eq!(
5307            pages.len(),
5308            1,
5309            "static 1-page PDFs should preserve the original page when XFA layout over-paginates"
5310        );
5311    }
5312
5313    #[test]
5314    fn dynamic_single_page_pdf_can_expand_beyond_original_page_count() {
5315        // Dynamic XFA forms may ship with a single placeholder PDF page while
5316        // Adobe lays out multiple pages from the XFA data/template at runtime.
5317        // Flattening must therefore preserve the layout engine's page count
5318        // instead of clamping to the original PDF page count.
5319        let xdp = overflowing_paginate_xdp(None);
5320        let pdf_bytes = build_xfa_pdf(&xdp);
5321        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5322
5323        let doc = Document::load_mem(&result).expect("load flattened PDF");
5324        let pages: Vec<ObjectId> = doc.page_iter().collect();
5325
5326        assert_eq!(
5327            pages.len(),
5328            2,
5329            "dynamic 1-page PDFs should be allowed to grow when XFA layout paginates"
5330        );
5331    }
5332
5333    #[test]
5334    fn flatten_removes_acroform() {
5335        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5336        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5337        let doc = Document::load_mem(&result).expect("load flattened PDF");
5338        let root_id = doc.trailer.get(b"Root").unwrap().as_reference().unwrap();
5339        let catalog = doc.get_dictionary(root_id).unwrap();
5340        assert!(
5341            catalog.get(b"AcroForm").is_err(),
5342            "/AcroForm still present after flatten"
5343        );
5344    }
5345
5346    #[test]
5347    fn flatten_non_xfa_pdf_unchanged() {
5348        // A PDF with no XFA should be returned as-is (no error).
5349        let mut doc = Document::with_version("1.4");
5350        let pages_id = doc.new_object_id();
5351        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5352            "Type"   => Object::Name(b"Page".to_vec()),
5353            "Parent" => Object::Reference(pages_id),
5354            "MediaBox" => Object::Array(vec![
5355                Object::Integer(0), Object::Integer(0),
5356                Object::Integer(612), Object::Integer(792),
5357            ])
5358        }));
5359        doc.objects.insert(
5360            pages_id,
5361            Object::Dictionary(dictionary! {
5362                "Type"  => Object::Name(b"Pages".to_vec()),
5363                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5364                "Count" => Object::Integer(1)
5365            }),
5366        );
5367        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5368            "Type"  => Object::Name(b"Catalog".to_vec()),
5369            "Pages" => Object::Reference(pages_id)
5370        }));
5371        doc.trailer.set("Root", Object::Reference(catalog_id));
5372        let mut raw = Vec::new();
5373        doc.save_to(&mut raw).unwrap();
5374
5375        // flatten_xfa_to_pdf should return Ok (with the same bytes).
5376        let result = flatten_xfa_to_pdf(&raw).expect("flatten non-XFA failed");
5377        assert!(!result.is_empty());
5378    }
5379
5380    #[test]
5381    fn placeholder_only_page_does_not_trigger_static_strip_path() {
5382        const PLACEHOLDER_STREAM: &str = r#"BT
5383/Helv 24 Tf
538472 720 Td
5385(Please wait...) Tj
53860 -32 Td
5387(If this message is not eventually replaced by the proper contents of the document,) Tj
53880 -32 Td
5389(your PDF viewer may not be able to display this type of document.) Tj
53900 -32 Td
5391(You can upgrade to the latest version of Adobe Reader by visiting reader_download.) Tj
5392ET
5393"#;
5394
5395        let pdf_bytes =
5396            build_xfa_pdf_with_content(SIMPLE_XDP, PLACEHOLDER_STREAM.as_bytes().to_vec());
5397        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5398
5399        let doc = Document::load_mem(&result).expect("load flattened PDF");
5400        let page_id = doc.page_iter().next().expect("flattened page");
5401        let page_dict = doc.get_dictionary(page_id).expect("page dict");
5402        let contents_id = page_dict
5403            .get(b"Contents")
5404            .ok()
5405            .and_then(|object| object.as_reference().ok())
5406            .expect("contents ref");
5407        let stream = doc
5408            .get_object(contents_id)
5409            .expect("contents object")
5410            .as_stream()
5411            .expect("contents stream");
5412        let content = String::from_utf8_lossy(&stream.content);
5413
5414        assert!(
5415            content.contains("John"),
5416            "flattened page should contain XFA-rendered field content"
5417        );
5418        assert!(
5419            !content.contains("Please wait"),
5420            "placeholder text should not survive XFA flattening"
5421        );
5422    }
5423
5424    #[test]
5425    fn hybrid_static_pdf_uses_xfa_layout_over_static_content() {
5426        // When a PDF has both XFA template and static page content,
5427        // XFA layout should always take priority — the static content
5428        // may be a pre-rendered preview with wrong page count (#744).
5429        let appearance = Object::Stream(Stream::new(
5430            dictionary! {
5431                "Type" => Object::Name(b"XObject".to_vec()),
5432                "Subtype" => Object::Name(b"Form".to_vec()),
5433                "BBox" => Object::Array(vec![
5434                    Object::Integer(0), Object::Integer(0),
5435                    Object::Integer(120), Object::Integer(30),
5436                ]),
5437                "Matrix" => Object::Array(vec![
5438                    Object::Integer(1), Object::Integer(0),
5439                    Object::Integer(0), Object::Integer(1),
5440                    Object::Integer(0), Object::Integer(0),
5441                ]),
5442                "Resources" => Object::Dictionary(dictionary! {}),
5443            },
5444            b"0 G\n0.5 0.5 119 29 re\ns\n".to_vec(),
5445        ));
5446        // Enough Tj operators (≥5) to exceed the old static content threshold.
5447        let page_content = b"BT /F1 12 Tf 72 720 Td (Line 1) Tj 0 -14 Td (Line 2) Tj 0 -14 Td (Line 3) Tj 0 -14 Td (Line 4) Tj 0 -14 Td (Line 5) Tj ET\n".to_vec();
5448        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5449            page_content,
5450            appearance,
5451            dictionary! {
5452                "FT" => Object::Name(b"Tx".to_vec()),
5453                "T" => Object::string_literal("field[0]"),
5454            },
5455        );
5456
5457        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5458        let doc = Document::load_mem(&result).expect("load flattened PDF");
5459        let page_id = doc.page_iter().next().expect("page");
5460        let page_dict = doc.get_dictionary(page_id).expect("page dict");
5461
5462        // XFA layout produces pages without widget annotations.
5463        assert!(
5464            page_dict.get(b"Annots").is_err(),
5465            "XFA-flattened page should have no annotations"
5466        );
5467    }
5468
5469    #[test]
5470    fn hybrid_static_pdf_uses_selected_button_appearance_state() {
5471        let yes_stream = Object::Stream(Stream::new(
5472            dictionary! {
5473                "Type" => Object::Name(b"XObject".to_vec()),
5474                "Subtype" => Object::Name(b"Form".to_vec()),
5475                "BBox" => Object::Array(vec![
5476                    Object::Integer(0), Object::Integer(0),
5477                    Object::Integer(20), Object::Integer(20),
5478                ]),
5479                "Matrix" => Object::Array(vec![
5480                    Object::Integer(1), Object::Integer(0),
5481                    Object::Integer(0), Object::Integer(1),
5482                    Object::Integer(0), Object::Integer(0),
5483                ]),
5484                "Resources" => Object::Dictionary(dictionary! {}),
5485            },
5486            b"BT /F1 8 Tf 1 1 Td (YES) Tj ET\n".to_vec(),
5487        ));
5488        let off_stream = Object::Stream(Stream::new(
5489            dictionary! {
5490                "Type" => Object::Name(b"XObject".to_vec()),
5491                "Subtype" => Object::Name(b"Form".to_vec()),
5492                "BBox" => Object::Array(vec![
5493                    Object::Integer(0), Object::Integer(0),
5494                    Object::Integer(20), Object::Integer(20),
5495                ]),
5496                "Matrix" => Object::Array(vec![
5497                    Object::Integer(1), Object::Integer(0),
5498                    Object::Integer(0), Object::Integer(1),
5499                    Object::Integer(0), Object::Integer(0),
5500                ]),
5501                "Resources" => Object::Dictionary(dictionary! {}),
5502            },
5503            b"BT /F1 8 Tf 1 1 Td (OFF) Tj ET\n".to_vec(),
5504        ));
5505
5506        let mut doc = Document::with_version("1.4");
5507        let state_id = doc.add_object(Object::Dictionary(dictionary! {
5508            "Yes" => yes_stream,
5509            "Off" => off_stream,
5510        }));
5511        let annot = dictionary! {
5512            "Subtype" => Object::Name(b"Widget".to_vec()),
5513            "Rect" => Object::Array(vec![
5514                Object::Integer(100), Object::Integer(700),
5515                Object::Integer(120), Object::Integer(720),
5516            ]),
5517            "AP" => Object::Dictionary(dictionary! {
5518                "N" => Object::Reference(state_id),
5519            }),
5520            "AS" => Object::Name(b"Yes".to_vec()),
5521            "FT" => Object::Name(b"Btn".to_vec()),
5522        };
5523        let ap_id =
5524            resolve_widget_normal_appearance(&mut doc, &annot).expect("selected normal appearance");
5525        let stream = doc
5526            .get_object(ap_id)
5527            .expect("appearance stream")
5528            .as_stream()
5529            .expect("appearance stream");
5530        let content = String::from_utf8_lossy(&stream.content);
5531
5532        assert!(
5533            content.contains("YES"),
5534            "flatten should choose the selected normal appearance state"
5535        );
5536    }
5537
5538    #[test]
5539    fn widget_as_off_without_off_appearance_returns_none() {
5540        // When /AS is "Off" but the Normal appearance dict has no "Off" key,
5541        // the widget is deselected. Returning None avoids baking a checked
5542        // mark from the only remaining on-state appearance.
5543        let yes_stream = Object::Stream(Stream::new(
5544            dictionary! {
5545                "Type" => Object::Name(b"XObject".to_vec()),
5546                "Subtype" => Object::Name(b"Form".to_vec()),
5547                "BBox" => Object::Array(vec![
5548                    Object::Integer(0), Object::Integer(0),
5549                    Object::Integer(10), Object::Integer(10),
5550                ]),
5551            },
5552            b"q 5 5 m 5 5 l S Q\n".to_vec(),
5553        ));
5554
5555        let mut doc = Document::with_version("1.4");
5556        // Normal appearance has only a "0" key (checked state), no "Off" key.
5557        let state_id = doc.add_object(Object::Dictionary(dictionary! {
5558            "0" => yes_stream,
5559        }));
5560        let annot = dictionary! {
5561            "Subtype" => Object::Name(b"Widget".to_vec()),
5562            "Rect" => Object::Array(vec![
5563                Object::Integer(100), Object::Integer(700),
5564                Object::Integer(110), Object::Integer(710),
5565            ]),
5566            "AP" => Object::Dictionary(dictionary! {
5567                "N" => Object::Reference(state_id),
5568            }),
5569            "AS" => Object::Name(b"Off".to_vec()),
5570            "FT" => Object::Name(b"Btn".to_vec()),
5571        };
5572        assert!(
5573            resolve_widget_normal_appearance(&mut doc, &annot).is_none(),
5574            "Off state with no Off appearance should not resolve to the on-state stream"
5575        );
5576    }
5577
5578    #[test]
5579    fn bake_checkbox_radio_ap_marks_skips_off_widgets_without_off_normal_appearance() {
5580        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5581            Vec::new(),
5582            Object::Dictionary(dictionary! {
5583                "1" => Object::Stream(Stream::new(
5584                    dictionary! {
5585                        "Type" => Object::Name(b"XObject".to_vec()),
5586                        "Subtype" => Object::Name(b"Form".to_vec()),
5587                        "BBox" => Object::Array(vec![
5588                            Object::Integer(0), Object::Integer(0),
5589                            Object::Integer(10), Object::Integer(10),
5590                        ]),
5591                        "Resources" => Object::Dictionary(dictionary! {}),
5592                    },
5593                    b"q 1 1 8 8 re W n 2 8 m 8 2 l 8 8 m 2 2 l s Q\n".to_vec(),
5594                )),
5595            }),
5596            dictionary! {
5597                "FT" => Object::Name(b"Btn".to_vec()),
5598                "AS" => Object::Name(b"Off".to_vec()),
5599                "T" => Object::string_literal("checkbox[0]"),
5600            },
5601        );
5602
5603        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5604        let page_id = doc.page_iter().next().expect("page");
5605        let baked = bake_checkbox_radio_ap_marks(&mut doc, page_id);
5606
5607        assert_eq!(baked, 0, "Off-state widget must not stamp the on-mark");
5608    }
5609
5610    #[test]
5611    fn adding_widget_xobject_preserves_indirect_inline_page_xobjects() {
5612        let mut doc = Document::with_version("1.4");
5613        let existing_xobject_id = doc.add_object(Object::Stream(Stream::new(
5614            dictionary! {
5615                "Type" => Object::Name(b"XObject".to_vec()),
5616                "Subtype" => Object::Name(b"Form".to_vec()),
5617                "BBox" => Object::Array(vec![
5618                    Object::Integer(0), Object::Integer(0),
5619                    Object::Integer(10), Object::Integer(10),
5620                ]),
5621            },
5622            b"q Q\n".to_vec(),
5623        )));
5624        let xobject_dict_id = doc.add_object(Object::Dictionary(dictionary! {
5625            "R11" => Object::Reference(existing_xobject_id),
5626        }));
5627
5628        let pages_id = doc.new_object_id();
5629        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5630            "Type" => Object::Name(b"Page".to_vec()),
5631            "Parent" => Object::Reference(pages_id),
5632            "MediaBox" => Object::Array(vec![
5633                Object::Integer(0), Object::Integer(0),
5634                Object::Integer(612), Object::Integer(792),
5635            ]),
5636            "Resources" => Object::Dictionary(dictionary! {
5637                "XObject" => Object::Reference(xobject_dict_id),
5638            }),
5639        }));
5640        doc.objects.insert(
5641            pages_id,
5642            Object::Dictionary(dictionary! {
5643                "Type"  => Object::Name(b"Pages".to_vec()),
5644                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5645                "Count" => Object::Integer(1)
5646            }),
5647        );
5648
5649        let new_xobject_id = doc.add_object(Object::Stream(Stream::new(
5650            dictionary! {
5651                "Type" => Object::Name(b"XObject".to_vec()),
5652                "Subtype" => Object::Name(b"Form".to_vec()),
5653                "BBox" => Object::Array(vec![
5654                    Object::Integer(0), Object::Integer(0),
5655                    Object::Integer(10), Object::Integer(10),
5656                ]),
5657            },
5658            b"0 0 10 10 re S\n".to_vec(),
5659        )));
5660
5661        add_xobject_to_page_resources(&mut doc, page_id, "XfaAp0", new_xobject_id);
5662
5663        let xobjects = doc
5664            .get_object(xobject_dict_id)
5665            .expect("xobject dict")
5666            .as_dict()
5667            .expect("xobject dict");
5668        assert!(
5669            xobjects.get(b"R11").is_ok(),
5670            "existing page XObject was lost"
5671        );
5672        assert!(
5673            xobjects.get(b"XfaAp0").is_ok(),
5674            "new flattened widget XObject was not added"
5675        );
5676    }
5677
5678    #[test]
5679    fn encrypted_pdf_without_xfa_returns_ok() {
5680        // Encrypted PDF without AcroForm/XFA → returned as-is (no XFA to flatten).
5681        let mut doc = Document::with_version("1.4");
5682        let pages_id = doc.new_object_id();
5683        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5684            "Type"     => Object::Name(b"Page".to_vec()),
5685            "Parent"   => Object::Reference(pages_id),
5686            "MediaBox" => Object::Array(vec![
5687                Object::Integer(0), Object::Integer(0),
5688                Object::Integer(612), Object::Integer(792),
5689            ]),
5690        }));
5691        doc.objects.insert(
5692            pages_id,
5693            Object::Dictionary(dictionary! {
5694                "Type"  => Object::Name(b"Pages".to_vec()),
5695                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5696                "Count" => Object::Integer(1),
5697            }),
5698        );
5699        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5700            "Type"  => Object::Name(b"Catalog".to_vec()),
5701            "Pages" => Object::Reference(pages_id),
5702        }));
5703        doc.trailer.set("Root", Object::Reference(catalog_id));
5704
5705        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5706            "Filter" => Object::Name(b"Standard".to_vec()),
5707            "V"      => Object::Integer(2),
5708            "Length"  => Object::Integer(128),
5709        }));
5710        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5711
5712        let mut buf = Vec::new();
5713        doc.save_to(&mut buf).expect("save test PDF");
5714
5715        let result = flatten_xfa_to_pdf(&buf);
5716        assert!(result.is_ok(), "non-XFA encrypted PDF should return Ok");
5717    }
5718
5719    #[test]
5720    fn encrypted_xfa_pdf_returns_encrypted_error() {
5721        // Encrypted PDF WITH AcroForm/XFA → should reach the decrypt check
5722        // and return Err(Encrypted) when the password is required.
5723        let mut doc = Document::with_version("1.4");
5724        let pages_id = doc.new_object_id();
5725        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5726            "Type"     => Object::Name(b"Page".to_vec()),
5727            "Parent"   => Object::Reference(pages_id),
5728            "MediaBox" => Object::Array(vec![
5729                Object::Integer(0), Object::Integer(0),
5730                Object::Integer(612), Object::Integer(792),
5731            ]),
5732        }));
5733        doc.objects.insert(
5734            pages_id,
5735            Object::Dictionary(dictionary! {
5736                "Type"  => Object::Name(b"Pages".to_vec()),
5737                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5738                "Count" => Object::Integer(1),
5739            }),
5740        );
5741        // Add AcroForm with XFA key so the byte-level pre-check passes.
5742        let xfa_stream_id = doc.add_object(Object::Stream(lopdf::Stream::new(
5743            dictionary! {},
5744            b"<xdp:xdp></xdp:xdp>".to_vec(),
5745        )));
5746        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
5747            "XFA" => Object::Reference(xfa_stream_id),
5748        }));
5749        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5750            "Type"     => Object::Name(b"Catalog".to_vec()),
5751            "Pages"    => Object::Reference(pages_id),
5752            "AcroForm" => Object::Reference(acroform_id),
5753        }));
5754        doc.trailer.set("Root", Object::Reference(catalog_id));
5755
5756        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5757            "Filter" => Object::Name(b"Standard".to_vec()),
5758            "V"      => Object::Integer(2),
5759            "Length"  => Object::Integer(128),
5760        }));
5761        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5762
5763        let mut buf = Vec::new();
5764        doc.save_to(&mut buf).expect("save encrypted PDF");
5765
5766        let result = flatten_xfa_to_pdf(&buf);
5767        assert!(result.is_err(), "expected Encrypted error");
5768        let err = result.unwrap_err();
5769        assert!(
5770            matches!(err, XfaError::Encrypted(_)),
5771            "expected XfaError::Encrypted, got: {err:?}"
5772        );
5773    }
5774
5775    #[test]
5776    fn owner_only_encrypted_pdf_is_handled_transparently() {
5777        // Owner-only encrypted PDFs (empty user password) are auto-decrypted by lopdf.
5778        // Verify that flatten_xfa_to_pdf processes them without error.
5779        let mut doc = Document::with_version("2.0");
5780        let pages_id = doc.new_object_id();
5781        let page_id = doc.add_object(Object::Dictionary(dictionary! {
5782            "Type"     => Object::Name(b"Page".to_vec()),
5783            "Parent"   => Object::Reference(pages_id),
5784            "MediaBox" => Object::Array(vec![
5785                Object::Integer(0), Object::Integer(0),
5786                Object::Integer(612), Object::Integer(792),
5787            ]),
5788        }));
5789        doc.objects.insert(
5790            pages_id,
5791            Object::Dictionary(dictionary! {
5792                "Type"  => Object::Name(b"Pages".to_vec()),
5793                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
5794                "Count" => Object::Integer(1),
5795            }),
5796        );
5797        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5798            "Type"  => Object::Name(b"Catalog".to_vec()),
5799            "Pages" => Object::Reference(pages_id),
5800        }));
5801        doc.trailer.set("Root", Object::Reference(catalog_id));
5802
5803        // Encrypt with owner password "secret", empty user password.
5804        let state = lopdf::aes256_encryption_state("secret", "", lopdf::Permissions::default())
5805            .expect("create encryption state");
5806        doc.encrypt(&state).expect("encrypt document");
5807
5808        let mut buf = Vec::new();
5809        doc.save_to(&mut buf).expect("save encrypted PDF");
5810
5811        // lopdf auto-decrypts owner-only encrypted PDFs, so is_pdf_encrypted returns false.
5812        assert!(
5813            !is_pdf_encrypted(&buf),
5814            "lopdf should auto-decrypt owner-only PDFs"
5815        );
5816
5817        // flatten_xfa_to_pdf should succeed — no XFA content, returns input as-is.
5818        let result = flatten_xfa_to_pdf(&buf);
5819        assert!(
5820            result.is_ok(),
5821            "owner-only encrypted PDF should be handled, got: {result:?}"
5822        );
5823    }
5824
5825    /// Build a minimal PDF with a Type0 (CID) font that has a /W array.
5826    fn build_pdf_with_cid_font(w_array: Vec<Object>, dw: Option<i64>) -> Document {
5827        let mut doc = Document::with_version("1.4");
5828
5829        // Minimal CIDFont descendant dictionary with /W
5830        let mut cid_dict = dictionary! {
5831            "Type"    => Object::Name(b"Font".to_vec()),
5832            "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
5833            "BaseFont" => Object::Name(b"TestFont".to_vec()),
5834            "W"       => Object::Array(w_array)
5835        };
5836        if let Some(dw_val) = dw {
5837            cid_dict.set("DW", Object::Integer(dw_val));
5838        }
5839        let cid_id = doc.add_object(Object::Dictionary(cid_dict));
5840
5841        // Type0 composite font pointing to the CIDFont
5842        let type0_dict = dictionary! {
5843            "Type"            => Object::Name(b"Font".to_vec()),
5844            "Subtype"         => Object::Name(b"Type0".to_vec()),
5845            "BaseFont"        => Object::Name(b"TestFont".to_vec()),
5846            "DescendantFonts" => Object::Array(vec![Object::Reference(cid_id)])
5847        };
5848        doc.add_object(Object::Dictionary(type0_dict));
5849        doc
5850    }
5851
5852    /// Test CID /W array parsing: consecutive widths format.
5853    /// /W [120 [500 600 700]] → CID 120=500, CID 121=600, CID 122=700
5854    #[test]
5855    fn cid_w_array_consecutive() {
5856        let w = vec![
5857            Object::Integer(120),
5858            Object::Array(vec![
5859                Object::Integer(500),
5860                Object::Integer(600),
5861                Object::Integer(700),
5862            ]),
5863        ];
5864        let doc = build_pdf_with_cid_font(w, None);
5865        let _fonts = extract_embedded_fonts(&doc);
5866
5867        // No font stream embedded, so extract_embedded_fonts won't find data.
5868        // Test the parser directly via the Type0 dict.
5869        for obj in doc.objects.values() {
5870            let dict = match obj.as_dict() {
5871                Ok(d) => d,
5872                Err(_) => continue,
5873            };
5874            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5875            if subtype == Some(b"Type0".as_slice()) {
5876                let result = extract_cid_font_widths(&doc, dict);
5877                let (first, widths) = result.expect("should parse /W array");
5878                assert_eq!(first, 120);
5879                assert_eq!(widths.len(), 3);
5880                assert_eq!(widths[0], 500); // CID 120
5881                assert_eq!(widths[1], 600); // CID 121
5882                assert_eq!(widths[2], 700); // CID 122
5883                return;
5884            }
5885        }
5886        panic!("Type0 font not found in test document");
5887    }
5888
5889    /// Test CID /W array parsing: range format.
5890    /// /W [200 300 250] → CIDs 200-300 all have width 250
5891    #[test]
5892    fn cid_w_array_range() {
5893        let w = vec![
5894            Object::Integer(200),
5895            Object::Integer(300),
5896            Object::Integer(250),
5897        ];
5898        let doc = build_pdf_with_cid_font(w, None);
5899
5900        for obj in doc.objects.values() {
5901            let dict = match obj.as_dict() {
5902                Ok(d) => d,
5903                Err(_) => continue,
5904            };
5905            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5906            if subtype == Some(b"Type0".as_slice()) {
5907                let (first, widths) =
5908                    extract_cid_font_widths(&doc, dict).expect("should parse /W range");
5909                assert_eq!(first, 200);
5910                assert_eq!(widths.len(), 101); // 200..=300
5911                assert!(widths.iter().all(|&w| w == 250));
5912                return;
5913            }
5914        }
5915        panic!("Type0 font not found");
5916    }
5917
/// CID `/W` parsing with both element forms in a single array.
///
/// `/W [120 [500 600 700] 200 300 250]` gives CID 120=500, 121=600, 122=700
/// and CIDs 200-300 = 250. The CIDs in between (123-199) are not listed and
/// must be filled with the default width, here `/DW` = 1000.
#[test]
fn cid_w_array_mixed() {
    let consecutive_run = Object::Array(vec![
        Object::Integer(500),
        Object::Integer(600),
        Object::Integer(700),
    ]);
    let w_entries = vec![
        Object::Integer(120),
        consecutive_run,
        Object::Integer(200),
        Object::Integer(300),
        Object::Integer(250),
    ];
    let doc = build_pdf_with_cid_font(w_entries, Some(1000));

    // Pick the first dictionary object whose /Subtype is /Type0.
    let type0_font = doc
        .objects
        .values()
        .filter_map(|obj| obj.as_dict().ok())
        .find(|dict| {
            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
            subtype == Some(b"Type0".as_slice())
        })
        .expect("Type0 font not found");

    let (first_cid, cid_widths) =
        extract_cid_font_widths(&doc, type0_font).expect("should parse mixed /W");
    assert_eq!(first_cid, 120);
    assert_eq!(cid_widths.len(), 181); // 120..=300 inclusive

    // (offset from CID 120, expected width)
    let expectations = [
        (0, 500),    // CID 120 — consecutive part
        (1, 600),    // CID 121
        (2, 700),    // CID 122
        (3, 1000),   // CID 123 — first gap entry, filled with /DW
        (79, 1000),  // CID 199 — last gap entry
        (80, 250),   // CID 200 — range part
        (180, 250),  // CID 300
    ];
    for &(offset, expected) in &expectations {
        assert_eq!(cid_widths[offset], expected, "width of CID {}", 120 + offset);
    }
}
5963
/// Without an explicit `/DW`, CIDs missing from `/W` must fall back to 1000.
#[test]
fn cid_w_array_default_width() {
    // Two single-width runs: CID 10 = 400 and CID 20 = 600, nothing in between.
    let w_entries = vec![
        Object::Integer(10),
        Object::Array(vec![Object::Integer(400)]),
        Object::Integer(20),
        Object::Array(vec![Object::Integer(600)]),
    ];
    // `None` means the fixture is built without /DW.
    let doc = build_pdf_with_cid_font(w_entries, None);

    // Pick the first dictionary object whose /Subtype is /Type0.
    let type0_font = doc
        .objects
        .values()
        .filter_map(|obj| obj.as_dict().ok())
        .find(|dict| {
            let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
            subtype == Some(b"Type0".as_slice())
        })
        .expect("Type0 font not found");

    let (first_cid, cid_widths) =
        extract_cid_font_widths(&doc, type0_font).expect("should parse /W");
    assert_eq!(first_cid, 10);
    assert_eq!(cid_widths[0], 400); // CID 10
    assert_eq!(cid_widths[5], 1000); // CID 15 — not in /W, default applies
    assert_eq!(cid_widths[10], 600); // CID 20
}
5992
// A simple Type1 font with /Widths but no FontFile* must still be reported by
// `extract_embedded_fonts`, with empty font data, its code-indexed widths and
// a back-reference to the originating PDF object.
#[test]
fn extract_embedded_fonts_keeps_simple_pdf_fonts_without_fontfile() {
    let glyph_widths = vec![
        Object::Integer(278),
        Object::Integer(333),
        Object::Integer(612),
    ];
    let font_dict = dictionary! {
        "Type" => Object::Name(b"Font".to_vec()),
        "Subtype" => Object::Name(b"Type1".to_vec()),
        "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
        "FirstChar" => Object::Integer(32),
        "LastChar" => Object::Integer(34),
        "Widths" => Object::Array(glyph_widths),
        "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
    };

    let mut doc = Document::new();
    let font_id = doc.add_object(Object::Dictionary(font_dict));

    let extracted = extract_embedded_fonts(&doc);
    let myriad = extracted
        .iter()
        .find(|candidate| candidate.name == "MyriadPro-Regular")
        .expect("expected reusable simple font");

    assert!(myriad.data.is_empty(), "no FontFile* should keep data empty");
    assert_eq!(myriad.pdf_widths, Some((32, vec![278, 333, 612])));
    let expected_source = Some(PdfSourceFont { object_id: font_id });
    assert_eq!(myriad.pdf_source_font, expected_source);
}
6023
// Storing regular, bold and italic faces of one family must keep every
// PostScript name and yield exactly one entry under the bare family name.
#[test]
fn store_font_data_reserves_family_alias_for_regular_face() {
    // (PostScript name, object number of the source font)
    let faces = [("ArialMT", 1), ("Arial-BoldMT", 2), ("Arial-ItalicMT", 3)];

    let mut fonts = Vec::new();
    for &(ps_name, object_number) in &faces {
        store_font_data(
            &mut fonts,
            ps_name,
            Vec::new(),
            Some((32, vec![278, 333, 611])),
            None,
            Some(PdfSourceFont {
                object_id: (object_number, 0),
            }),
        );
    }

    let stored_names: Vec<&str> = fonts.iter().map(|entry| entry.name.as_str()).collect();
    for &(ps_name, _) in &faces {
        assert!(stored_names.contains(&ps_name));
    }

    let bare_family_entries = stored_names
        .iter()
        .filter(|&&stored| stored == "Arial")
        .count();
    assert_eq!(
        bare_family_entries, 1,
        "only the regular face should claim the bare family alias"
    );
}
6062
// A `-Regular` PostScript name must also be stored under its spaced family
// name ("Myriad Pro").
#[test]
fn store_font_data_keeps_regular_ps_family_alias() {
    let source_font = PdfSourceFont { object_id: (4, 0) };
    let code_widths = (32, vec![278, 333, 612]);

    let mut fonts = Vec::new();
    store_font_data(
        &mut fonts,
        "MyriadPro-Regular",
        Vec::new(),
        Some(code_widths),
        None,
        Some(source_font),
    );

    let has_family_alias = fonts.iter().any(|entry| entry.name == "Myriad Pro");
    assert!(
        has_family_alias,
        "regular PostScript names should still expose their family alias"
    );
}
6080
// /Contents may be a reference to an array of stream references; both levels
// of indirection must be followed so each stream is returned.
#[test]
fn page_content_streams_resolves_indirect_contents_arrays() {
    let mut doc = Document::new();

    // Two tiny content streams, added in order A then B.
    let payloads = [&b"(A) Tj\n"[..], &b"(B) Tj\n"[..]];
    let stream_refs: Vec<Object> = payloads
        .iter()
        .map(|payload| {
            let stream_id = doc.add_object(Stream::new(
                dictionary! {"Length" => Object::Integer(8)},
                payload.to_vec(),
            ));
            Object::Reference(stream_id)
        })
        .collect();

    // The array itself is an indirect object referenced from the page.
    let contents_array = doc.add_object(Object::Array(stream_refs));
    let page_dict = dictionary! {
        "Type" => Object::Name(b"Page".to_vec()),
        "Contents" => Object::Reference(contents_array),
    };
    let page_id = doc.add_object(Object::Dictionary(page_dict));

    let streams = page_content_streams(&doc, page_id);

    assert_eq!(
        streams.len(),
        2,
        "indirect /Contents arrays must be traversed"
    );
    for content in streams.iter() {
        assert!(content.windows(2).any(|pair| pair == b"Tj"));
    }
}
6111
// A resolved font that points back at an existing PDF font object (and has
// no font program bytes) must be reused as-is: no new objects are added and
// the returned metrics carry no font data.
#[test]
fn embed_resolved_fonts_reuses_existing_pdf_font_object() {
    let resolved_key = "Myriad Pro_Normal_Normal";

    let mut doc = Document::new();
    let source_font_dict = dictionary! {
        "Type" => Object::Name(b"Font".to_vec()),
        "Subtype" => Object::Name(b"Type1".to_vec()),
        "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
        "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
    };
    let source_font_id = doc.add_object(Object::Dictionary(source_font_dict));
    let object_count_before = doc.objects.len();

    let reusable_font = ResolvedFont {
        name: "Myriad Pro".to_string(),
        data: Vec::new(),
        face_index: 0,
        units_per_em: 1000,
        ascender: 800,
        descender: -200,
        pdf_widths: Some((32, vec![278, 333, 612])),
        pdf_encoding: None,
        pdf_source_font: Some(PdfSourceFont {
            object_id: source_font_id,
        }),
    };
    let mut resolved = HashMap::new();
    resolved.insert(resolved_key.to_string(), reusable_font);

    let empty_layout = LayoutDom { pages: vec![] };
    let (_font_map, font_objects, metrics_data) =
        embed_resolved_fonts(&mut doc, &resolved, &empty_layout);

    assert_eq!(
        doc.objects.len(),
        object_count_before,
        "should not embed a new font object"
    );
    assert_eq!(font_objects.len(), 1);
    assert_eq!(font_objects[0].1, source_font_id);
    assert!(
        metrics_data[resolved_key].font_data.is_none(),
        "reused simple fonts must keep WinAnsi text encoding"
    );
}
6157
// Processing instructions may legally contain bare `&`; the sanitizer must
// return such input unchanged and the result must still parse.
#[test]
fn strip_undefined_entities_preserves_raw_ampersands_in_processing_instructions() {
    let input = r##"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><?renderCache.textRun 24 A. Adjustment & Location 0 1417 14917 0 0 0 "Myriad Pro" 0 0 18000 ISO-8859-1?><?renderCache.subset "Arial" 0 0 ISO-8859-1 "#$%&'()+,-./" ?><subform name="form1"><field name="A"/></subform></template>"##;

    let sanitized = strip_undefined_xml_entities(input);

    assert_eq!(
        sanitized, input,
        "raw ampersands inside processing instructions are valid and must survive sanitization"
    );
    let _reparsed = roxmltree::Document::parse(&sanitized)
        .expect("processing instructions must remain parseable");
}
6171
// Only unknown *named* entities (`&bogus;`) are removed; numeric character
// references and the predefined `&amp;` must pass through untouched.
#[test]
fn strip_undefined_entities_drops_only_true_named_entity_references() {
    let input = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="form1"><draw name="D"><value><text>alpha &bogus; beta &#169; &amp; gamma</text></value></draw></subform></template>"#;

    let sanitized = strip_undefined_xml_entities(input);

    let bogus_survived = sanitized.contains("&bogus;");
    assert!(
        !bogus_survived,
        "unknown named entities should still be removed for roxmltree compatibility"
    );
    for &kept_reference in &["&#169;", "&amp;"] {
        assert!(sanitized.contains(kept_reference));
    }
    let _reparsed = roxmltree::Document::parse(&sanitized).expect("sanitized XML should parse");
}
6186
/// A saved form DOM that records more instances of a repeating subform than
/// the template merge produced must grow the FormTree to match and carry the
/// saved field values into each instance.
#[test]
fn form_dom_expands_repeating_subform_instances() {
    use xfa_layout_engine::form::FormNodeType;

    // Pre-order search for the first descendant of `parent` called `name`.
    fn find_by_name(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
        tree.get(parent).children.iter().find_map(|&child| {
            if tree.get(child).name == name {
                Some(child)
            } else {
                find_by_name(tree, child, name)
            }
        })
    }

    // Direct children of `parent` named "Row", in child order.
    fn row_children(tree: &FormTree, parent: FormNodeId) -> Vec<FormNodeId> {
        tree.get(parent)
            .children
            .iter()
            .copied()
            .filter(|&child| tree.get(child).name == "Row")
            .collect()
    }

    // Template: `Items` holds a single repeatable `Row` subform
    // (bind match=none, occur max=-1) with one `Label` field.
    let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
          <subform name="root" layout="tb">
            <pageSet><pageArea name="P1">
              <contentArea w="200mm" h="280mm"/>
              <medium short="210mm" long="297mm"/>
            </pageArea></pageSet>
            <subform name="body" layout="tb">
              <subform name="Items" layout="tb">
                <bind match="none"/>
                <subform name="Row" layout="tb">
                  <bind match="none"/>
                  <occur max="-1"/>
                  <field name="Label"><ui><textEdit/></ui></field>
                </subform>
              </subform>
            </subform>
          </subform>
        </template>"#;

    // Saved form DOM: three `Row` instances, each with its own `Label` value.
    let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
          <subform name="root">
            <subform name="body">
              <subform name="Items">
                <instanceManager name="_Row"/>
                <subform name="Row">
                  <field name="Label"><value><text>Alpha</text></value></field>
                </subform>
                <subform name="Row">
                  <field name="Label"><value><text>Beta</text></value></field>
                </subform>
                <subform name="Row">
                  <field name="Label"><value><text>Gamma</text></value></field>
                </subform>
              </subform>
            </subform>
          </subform>
        </form>"#;

    let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
    let merger = crate::merger::FormMerger::new(&data_dom);
    let (mut tree, root_id) = merger.merge(template).unwrap();

    let items_id =
        find_by_name(&tree, root_id, "Items").expect("Items subform not found in tree");

    // Baseline: the merge against an empty data DOM yields a single Row.
    let rows_before = row_children(&tree, items_id).len();
    assert_eq!(
        rows_before, 1,
        "template merge should produce 1 Row (bind=none)"
    );

    // Replay the saved form DOM onto the merged tree; the return value is
    // not needed here.
    let _ = apply_form_dom_presence(
        &mut tree,
        root_id,
        form_xml,
        XfaRenderingPolicy::SavedStateFaithful,
        false,
    );

    // Afterwards there must be one Row per saved instance.
    let rows_after = row_children(&tree, items_id);
    assert_eq!(
        rows_after.len(),
        3,
        "form DOM should expand to 3 Row instances"
    );

    // The first child of every Row is its `Label` field; read its value.
    let labels: Vec<String> = rows_after
        .iter()
        .map(|&row_id| {
            let first_child = tree.get(row_id).children[0];
            if let FormNodeType::Field { value } = &tree.get(first_child).node_type {
                value.clone()
            } else {
                String::new()
            }
        })
        .collect();
    assert_eq!(labels, vec!["Alpha", "Beta", "Gamma"]);
}
6298
/// XFA 3.3 §8.6 / §3.1 — pageArea expansion driven by the form DOM.
///
/// If the saved form DOM lists several instances of the *only* pageArea in
/// the template, the FormTree must end up with one pageArea per listed
/// instance, each marked `runtime_instantiated_page`. Regression guard for
/// corpus doc 13275420c3c9afbb, which rendered 5 pages instead of 10.
#[test]
fn form_dom_expands_uniform_page_area_template() {
    use xfa_layout_engine::form::FormNodeType;

    // Pre-order walk from `id`, appending every PageArea node to `out`.
    fn collect_page_areas(tree: &FormTree, id: FormNodeId, out: &mut Vec<FormNodeId>) {
        let mut pending = vec![id];
        while let Some(node) = pending.pop() {
            if matches!(tree.get(node).node_type, FormNodeType::PageArea { .. }) {
                out.push(node);
            }
            // Reverse so the first child is popped (visited) first.
            pending.extend(tree.get(node).children.iter().rev().copied());
        }
    }

    let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
          <subform name="root" layout="tb">
            <pageSet>
              <pageArea name="Page1">
                <contentArea w="200mm" h="280mm"/>
                <medium short="210mm" long="297mm"/>
              </pageArea>
            </pageSet>
          </subform>
        </template>"#;

    // Five recorded instances of the one `Page1` template.
    let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
          <subform name="root">
            <pageSet>
              <pageArea name="Page1"/>
              <pageArea name="Page1"/>
              <pageArea name="Page1"/>
              <pageArea name="Page1"/>
              <pageArea name="Page1"/>
            </pageSet>
          </subform>
        </form>"#;

    let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
    let merger = crate::merger::FormMerger::new(&data_dom);
    let (mut tree, root_id) = merger.merge(template).unwrap();

    let _ = apply_form_dom_presence(
        &mut tree,
        root_id,
        form_xml,
        XfaRenderingPolicy::SavedStateFaithful,
        false,
    );

    let mut page_areas = Vec::new();
    collect_page_areas(&tree, root_id, &mut page_areas);

    assert_eq!(
        page_areas.len(),
        5,
        "uniform pageArea expansion: 5 form-DOM instances must clone the template"
    );
    for page_area in page_areas.iter().copied() {
        assert!(
            tree.meta(page_area).runtime_instantiated_page,
            "every expanded pageArea must be flagged as runtime-instantiated"
        );
    }
}
6368
/// A pageSet with more than one pageArea template (e.g. `Page1` plus
/// `OverFlowPage`) MUST NOT be expanded from the form DOM: those pageAreas
/// are a pre-allocated menu rather than a uniform repetition, and cloning
/// them would over-paginate.
#[test]
fn form_dom_skips_multi_template_page_area_expansion() {
    use xfa_layout_engine::form::FormNodeType;

    // Pre-order walk from `id`, appending every PageArea node to `out`.
    fn collect_page_areas(tree: &FormTree, id: FormNodeId, out: &mut Vec<FormNodeId>) {
        let mut pending = vec![id];
        while let Some(node) = pending.pop() {
            if matches!(tree.get(node).node_type, FormNodeType::PageArea { .. }) {
                out.push(node);
            }
            // Reverse so the first child is popped (visited) first.
            pending.extend(tree.get(node).children.iter().rev().copied());
        }
    }

    let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
          <subform name="root" layout="tb">
            <pageSet>
              <pageArea name="Page1">
                <contentArea w="200mm" h="280mm"/>
                <medium short="210mm" long="297mm"/>
              </pageArea>
              <pageArea name="OverFlowPage">
                <contentArea w="200mm" h="280mm"/>
                <medium short="210mm" long="297mm"/>
              </pageArea>
            </pageSet>
          </subform>
        </template>"#;

    // Saved state: one `Page1` followed by four `OverFlowPage` instances.
    let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
          <subform name="root">
            <pageSet>
              <pageArea name="Page1"/>
              <pageArea name="OverFlowPage"/>
              <pageArea name="OverFlowPage"/>
              <pageArea name="OverFlowPage"/>
              <pageArea name="OverFlowPage"/>
            </pageSet>
          </subform>
        </form>"#;

    let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
    let merger = crate::merger::FormMerger::new(&data_dom);
    let (mut tree, root_id) = merger.merge(template).unwrap();

    let _ = apply_form_dom_presence(
        &mut tree,
        root_id,
        form_xml,
        XfaRenderingPolicy::SavedStateFaithful,
        false,
    );

    let mut page_areas = Vec::new();
    collect_page_areas(&tree, root_id, &mut page_areas);

    assert_eq!(
        page_areas.len(),
        2,
        "multi-template pageSet must not clone pageAreas (kept original 2)"
    );
    for page_area in page_areas.iter().copied() {
        let flagged = tree.meta(page_area).runtime_instantiated_page;
        assert!(
            !flagged,
            "non-expansion case must not set runtime_instantiated_page flag"
        );
    }
}
6439
/// Admit gate for `XFA_FORMDOM_ADMIT_DATABOUND` (default-off).
///
/// §3.1 unmatched-subform suppression hides named template subforms that the
/// saved form DOM did not enumerate. For DATA-BOUND unmatched subforms that
/// is wrong — Adobe renders them. The default-off production override sends
/// those through the same guarded admit branch as `FreshMergeExperimental`,
/// without changing the policy default and without loosening the guard: an
/// unmatched subform with no `bound_data_node` must stay suppressed in every
/// mode (the over-pagination guard).
///
/// The override is supplied as a plain bool (the env read happens at the
/// pipeline boundary), so the test is deterministic and touches no env.
#[test]
fn formdom_admit_databound_override_admits_only_data_bound() {
    use xfa_layout_engine::form::{FormNodeType, Presence};

    // Pre-order search for the first descendant of `parent` called `name`.
    fn find(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
        tree.get(parent).children.iter().find_map(|&child| {
            if tree.get(child).name == name {
                Some(child)
            } else {
                find(tree, child, name)
            }
        })
    }

    // root > body > { Bound (data-bound), Unbound (no data), Present }.
    let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
          <subform name="root" layout="tb">
            <subform name="body" layout="tb">
              <subform name="Bound" layout="tb">
                <field name="A"><ui><textEdit/></ui></field>
              </subform>
              <subform name="Unbound" layout="tb">
                <field name="B"><ui><textEdit/></ui></field>
              </subform>
              <subform name="Present" layout="tb">
                <field name="C"><ui><textEdit/></ui></field>
              </subform>
            </subform>
          </subform>
        </template>"#;

    // The saved form DOM lists only `Present` under `body`, which leaves
    // `Bound` and `Unbound` as unmatched §3.1 suppression candidates. `body`
    // does list a subform child, so `has_subform_children` is true.
    let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
          <subform name="root">
            <subform name="body">
              <subform name="Present">
                <field name="C"><value><text>x</text></value></field>
              </subform>
            </subform>
          </subform>
        </form>"#;

    // Builds a fresh tree, forces the data-binding state explicitly (so the
    // test does not depend on merge defaults), applies the form DOM, and
    // yields (admitted_count, Bound.presence, Unbound.presence).
    let scenario = |policy: XfaRenderingPolicy, override_on: bool| {
        let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
        let merger = crate::merger::FormMerger::new(&data_dom);
        let (mut tree, root_id) = merger.merge(template).unwrap();

        let bound = find(&tree, root_id, "Bound").expect("Bound subform in tree");
        let unbound = find(&tree, root_id, "Unbound").expect("Unbound subform in tree");
        assert!(matches!(tree.get(bound).node_type, FormNodeType::Subform));
        assert!(matches!(tree.get(unbound).node_type, FormNodeType::Subform));

        // `Bound` gets a data node, `Unbound` gets none; everything else is
        // identical between the two.
        for (id, data_node) in [(bound, Some(0)), (unbound, None)] {
            let meta = tree.meta_mut(id);
            meta.presence = Presence::Visible;
            meta.is_zero_instance_prototype = false;
            meta.data_bind_none = false;
            meta.bound_data_node = data_node;
        }

        let (admitted, _, _) =
            apply_form_dom_presence(&mut tree, root_id, form_xml, policy, override_on);
        let bound_presence = tree.meta(bound).presence;
        let unbound_presence = tree.meta(unbound).presence;
        (admitted, bound_presence, unbound_presence)
    };

    // (policy, override flag, expected admitted count, expected Bound
    //  presence, expected Unbound presence, [admitted / Bound / Unbound
    //  failure messages])
    let cases = [
        // 1. Production default: nothing is admitted, both unmatched
        //    subforms are suppressed — the unchanged baseline.
        (
            XfaRenderingPolicy::SavedStateFaithful,
            false,
            0,
            Presence::Hidden,
            Presence::Hidden,
            [
                "override off admits nothing under SavedStateFaithful",
                "data-bound suppressed when override off",
                "non-data suppressed when override off",
            ],
        ),
        // 2. Override ON under the production policy: only the data-bound
        //    subform is admitted; the non-data one stays suppressed.
        (
            XfaRenderingPolicy::SavedStateFaithful,
            true,
            1,
            Presence::Visible,
            Presence::Hidden,
            [
                "override on admits exactly the data-bound subform",
                "data-bound admitted when override on",
                "non-data subform must stay suppressed with override on (over-pagination guard)",
            ],
        ),
        // 3. FreshMergeExperimental without the override admits the same
        //    set — the override only extends that to the production policy.
        (
            XfaRenderingPolicy::FreshMergeExperimental,
            false,
            1,
            Presence::Visible,
            Presence::Hidden,
            [
                "fresh-merge admits the data-bound subform without override",
                "data-bound admitted under fresh-merge",
                "non-data suppressed under fresh-merge",
            ],
        ),
    ];

    for (policy, override_on, want_admitted, want_bound, want_unbound, messages) in cases {
        let (admitted, bound_presence, unbound_presence) = scenario(policy, override_on);
        assert_eq!(admitted, want_admitted, "{}", messages[0]);
        assert_eq!(bound_presence, want_bound, "{}", messages[1]);
        assert_eq!(unbound_presence, want_unbound, "{}", messages[2]);
    }
}
6583
// GL-QA36: the re-entrance guard must turn a nested flatten call into an
// error instead of recursing without bound.
//
// The thread-local depth counter is not touched from this sub-module;
// flatten_xfa_to_pdf_simulate_reentrant (a #[cfg(test)] helper) sets
// FLATTEN_DEPTH=1 before calling.
#[test]
fn flatten_xfa_to_pdf_recursion_guard_returns_error() {
    let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);

    let err_msg = match flatten_xfa_to_pdf_simulate_reentrant(&pdf_bytes) {
        Err(err) => err.to_string(),
        Ok(_) => panic!("expected recursion guard to return Err, got Ok"),
    };

    let mentions_recursion = err_msg.contains("recursively");
    assert!(
        mentions_recursion,
        "expected error message to mention recursion, got: {err_msg}"
    );
}
6603
// GL-QA36: a completed flatten call must leave the depth counter at 0, so
// that later calls on the same thread are not rejected by the guard.
#[test]
fn flatten_xfa_to_pdf_depth_counter_resets_after_call() {
    // Warm-up call; its outcome is deliberately ignored.
    let first_input = build_xfa_pdf(SIMPLE_XDP);
    let _ = flatten_xfa_to_pdf(&first_input);

    // A leaked counter would make this second call fail.
    let second_input = build_xfa_pdf(SIMPLE_XDP);
    let second = flatten_xfa_to_pdf(&second_input);
    assert!(
        second.is_ok(),
        "second flatten call should succeed, got: {second:?}"
    );
}
6619
// XFA-F1-05 (issue #1088): empty input must never make flatten_xfa_to_pdf
// panic. Zero bytes are not a valid PDF, so an Err is the expected outcome,
// but the only hard requirement checked here is "no panic/abort".
#[test]
fn flatten_xfa_to_pdf_does_not_panic_on_empty_input() {
    // Either variant is accepted: Err is the natural answer, and Ok would
    // merely mean the PDF library tolerates empty bytes.
    let _outcome = flatten_xfa_to_pdf(&[]);
}
6631
6632    // -----------------------------------------------------------------------
6633    // XFA-F6-01 (#1109): Pipeline contract — minimal well-formed XFA PDF
6634    // -----------------------------------------------------------------------
6635
/// XFA-F6-01: running the whole flatten pipeline on a minimal well-formed
/// XFA PDF must not panic. This drives every pipeline stage and therefore
/// checks that the debug_assert ordering constraints hold.
#[test]
fn flatten_pipeline_completes_on_minimal_xfa_pdf() {
    let minimal_pdf = build_xfa_pdf(SIMPLE_XDP);
    // Ok and Err are both fine (a layout failure falling back to
    // static_fallback is acceptable); a debug_assert violation would panic
    // in debug builds, and that is what this test would catch.
    let _outcome = flatten_xfa_to_pdf(&minimal_pdf);
}
6648
// Requesting the layout dump must not change the produced PDF bytes, and
// the dump must describe at least one page, numbered from 1, whose used
// height does not exceed its page height.
#[test]
fn flatten_with_layout_dump_preserves_pdf_bytes() {
    let input = build_xfa_pdf(SIMPLE_XDP);

    let plain_output = flatten_xfa_to_pdf(&input).expect("plain flatten should succeed");
    let (dump_output, dump) =
        flatten_xfa_to_pdf_with_layout_dump(&input).expect("dump flatten should succeed");

    assert_eq!(dump_output, plain_output);
    assert!(!dump.pages.is_empty());

    let first_page = &dump.pages[0];
    assert_eq!(first_page.page_num, 1);
    assert!(first_page.used_height <= first_page.page_height);
}
6661
6662    // -----------------------------------------------------------------------
6663    // XFA-F6-02 (#1110): AcroForm/XFA removal tests
6664    // -----------------------------------------------------------------------
6665
6666    /// After flattening an XFA PDF, the output must not contain /NeedsRendering.
6667    #[test]
6668    fn flatten_removes_needs_rendering() {
6669        // Build a PDF with NeedsRendering in the catalog.
6670        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
6671        // Insert NeedsRendering into the catalog via lopdf.
6672        {
6673            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for NeedsRendering test");
6674            let root_id = match doc.trailer.get(b"Root") {
6675                Ok(Object::Reference(id)) => *id,
6676                _ => panic!("no Root in trailer"),
6677            };
6678            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
6679                dict.set("NeedsRendering", Object::Boolean(true));
6680            }
6681            let mut out = Vec::new();
6682            doc.save_to(&mut out)
6683                .expect("re-save for NeedsRendering test");
6684            pdf_bytes = out;
6685        }
6686
6687        // Flatten should strip NeedsRendering.
6688        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
6689        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
6690        let root_id = match doc.trailer.get(b"Root") {
6691            Ok(Object::Reference(id)) => *id,
6692            _ => panic!("no Root in flattened trailer"),
6693        };
6694        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
6695        assert!(
6696            catalog.get(b"NeedsRendering").is_err(),
6697            "/NeedsRendering must be absent after flatten"
6698        );
6699    }
6700
6701    /// After flattening an XFA PDF, the output must not contain /XFA anywhere
6702    /// in the catalog or AcroForm dictionary.
6703    #[test]
6704    fn flatten_removes_xfa_entry() {
6705        let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
6706        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
6707
6708        // Search the serialised bytes for /XFA — the key must not appear.
6709        // We look for " /XFA" / "\n/XFA" patterns in the raw output.
6710        let flattened_str = String::from_utf8_lossy(&flattened);
6711        assert!(
6712            !flattened_str.contains("/XFA"),
6713            "/XFA must be absent from flattened output, but was found"
6714        );
6715    }
6716
6717    #[test]
6718    fn remove_acroform_purges_xfa_packet_objects() {
6719        let (mut doc, acroform_id, xfa_ids) = build_xfa_doc_with_xfa_array();
6720
6721        remove_acroform(&mut doc);
6722
6723        assert!(
6724            !doc.objects.contains_key(&acroform_id),
6725            "AcroForm object should be removed from doc.objects"
6726        );
6727        for xfa_id in &xfa_ids {
6728            assert!(
6729                !doc.objects.contains_key(xfa_id),
6730                "XFA packet object {xfa_id:?} should be removed from doc.objects"
6731            );
6732        }
6733
6734        let mut out = Vec::new();
6735        doc.save_to(&mut out).expect("save cleaned PDF");
6736        let out_str = String::from_utf8_lossy(&out);
6737        assert!(
6738            !out_str.contains("xdp:xdp"),
6739            "serialized output should not contain orphaned XFA packet payloads"
6740        );
6741        assert!(
6742            !out_str.contains("<template"),
6743            "serialized output should not contain orphaned template payloads"
6744        );
6745    }
6746
6747    /// After flattening, there must be no empty /Annots arrays in the output.
6748    #[test]
6749    fn flatten_removes_empty_annots_arrays() {
6750        // Build a PDF with an empty Annots array on the page.
6751        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
6752        {
6753            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for annots test");
6754            let page_id = doc.page_iter().next().expect("at least one page");
6755            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
6756                dict.set("Annots", Object::Array(vec![]));
6757            }
6758            let mut out = Vec::new();
6759            doc.save_to(&mut out).expect("re-save for annots test");
6760            pdf_bytes = out;
6761        }
6762
6763        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
6764        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
6765        for page_id in doc.page_iter() {
6766            let page = doc.get_dictionary(page_id).expect("page dict");
6767            if let Ok(Object::Array(arr)) = page.get(b"Annots") {
6768                // absent = good; only a present /Annots must be non-empty
6769                assert!(
6770                    !arr.is_empty(),
6771                    "page {:?}: /Annots must either be absent or non-empty after flatten",
6772                    page_id
6773                );
6774            }
6775        }
6776    }
6777
6778    #[test]
6779    fn remove_acroform_strips_widgets_from_indirect_annots_arrays() {
6780        let appearance = Object::Stream(Stream::new(
6781            dictionary! {
6782                "Type" => Object::Name(b"XObject".to_vec()),
6783                "Subtype" => Object::Name(b"Form".to_vec()),
6784                "BBox" => Object::Array(vec![
6785                    Object::Integer(0), Object::Integer(0),
6786                    Object::Integer(20), Object::Integer(20),
6787                ]),
6788                "Resources" => Object::Dictionary(dictionary! {}),
6789            },
6790            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
6791        ));
6792        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
6793            Vec::new(),
6794            appearance,
6795            dictionary! {
6796                "FT" => Object::Name(b"Tx".to_vec()),
6797                "T" => Object::string_literal("field[0]"),
6798            },
6799        );
6800
6801        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
6802        let page_id = doc.page_iter().next().expect("page");
6803        let annots = page_annotations(&doc, page_id);
6804        let annots_id = doc.add_object(Object::Array(annots));
6805        if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
6806            page_dict.set("Annots", Object::Reference(annots_id));
6807        }
6808
6809        remove_acroform(&mut doc);
6810
6811        let page = doc.get_dictionary(page_id).expect("page dict");
6812        assert!(
6813            page.get(b"Annots").is_err(),
6814            "widget-only indirect /Annots must be removed"
6815        );
6816    }
6817
6818    #[test]
6819    fn acroform_without_xfa_falls_back_to_static_cleanup() {
6820        let appearance = Object::Stream(Stream::new(
6821            dictionary! {
6822                "Type" => Object::Name(b"XObject".to_vec()),
6823                "Subtype" => Object::Name(b"Form".to_vec()),
6824                "BBox" => Object::Array(vec![
6825                    Object::Integer(0), Object::Integer(0),
6826                    Object::Integer(20), Object::Integer(20),
6827                ]),
6828                "Resources" => Object::Dictionary(dictionary! {}),
6829            },
6830            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
6831        ));
6832        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
6833            Vec::new(),
6834            appearance,
6835            dictionary! {
6836                "FT" => Object::Name(b"Tx".to_vec()),
6837                "T" => Object::string_literal("field[0]"),
6838            },
6839        );
6840
6841        let mut doc = Document::load_mem(&pdf_bytes).expect("parse source PDF");
6842        let root_id = match doc.trailer.get(b"Root") {
6843            Ok(Object::Reference(id)) => *id,
6844            _ => panic!("no Root"),
6845        };
6846        let acroform_id = doc
6847            .get_dictionary(root_id)
6848            .expect("catalog")
6849            .get(b"AcroForm")
6850            .expect("AcroForm")
6851            .as_reference()
6852            .expect("AcroForm ref");
6853        if let Ok(Object::Dictionary(ref mut acroform)) = doc.get_object_mut(acroform_id) {
6854            acroform.remove(b"XFA");
6855        }
6856        let mut acroform_only = Vec::new();
6857        doc.save_to(&mut acroform_only)
6858            .expect("save AcroForm-only PDF");
6859
6860        let flattened = flatten_xfa_to_pdf(&acroform_only).expect("flatten failed");
6861        let flattened_doc = Document::load_mem(&flattened).expect("parse flattened PDF");
6862        let root_id = match flattened_doc.trailer.get(b"Root") {
6863            Ok(Object::Reference(id)) => *id,
6864            _ => panic!("no Root in flattened PDF"),
6865        };
6866        let catalog = flattened_doc
6867            .get_dictionary(root_id)
6868            .expect("flattened catalog");
6869        assert!(
6870            catalog.get(b"AcroForm").is_err(),
6871            "AcroForm-only PDFs should still be cleaned by flatten"
6872        );
6873
6874        let page_id = flattened_doc.page_iter().next().expect("flattened page");
6875        assert!(
6876            page_annotations(&flattened_doc, page_id).is_empty(),
6877            "flattened AcroForm-only PDFs should not retain widget annotations"
6878        );
6879    }
6880
6881    // -----------------------------------------------------------------------
6882    // XFA-F6-03 (#1111): validate_flattened_pdf tests
6883    // -----------------------------------------------------------------------
6884
6885    /// A clean (non-XFA) PDF must pass validation with no warnings.
6886    #[test]
6887    fn validate_flattened_pdf_clean_pdf_passes() {
6888        // Build the minimal PDF document (no AcroForm/XFA).
6889        let mut doc = Document::with_version("1.4");
6890        let pages_id = doc.new_object_id();
6891        let page_id = doc.add_object(Object::Dictionary(dictionary! {
6892            "Type"     => Object::Name(b"Page".to_vec()),
6893            "Parent"   => Object::Reference(pages_id),
6894            "MediaBox" => Object::Array(vec![
6895                Object::Integer(0), Object::Integer(0),
6896                Object::Integer(612), Object::Integer(792),
6897            ])
6898        }));
6899        doc.objects.insert(
6900            pages_id,
6901            Object::Dictionary(dictionary! {
6902                "Type"  => Object::Name(b"Pages".to_vec()),
6903                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
6904                "Count" => Object::Integer(1)
6905            }),
6906        );
6907        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
6908            "Type"  => Object::Name(b"Catalog".to_vec()),
6909            "Pages" => Object::Reference(pages_id)
6910        }));
6911        doc.trailer.set("Root", Object::Reference(catalog_id));
6912        let mut pdf_bytes = Vec::new();
6913        doc.save_to(&mut pdf_bytes).expect("save clean PDF");
6914
6915        let validation = validate_flattened_pdf(&pdf_bytes).expect("validate failed");
6916        assert!(
6917            validation.has_no_acroform,
6918            "clean PDF should have no AcroForm"
6919        );
6920        assert!(validation.has_no_xfa, "clean PDF should have no XFA");
6921        assert!(
6922            validation.has_no_needs_rendering,
6923            "clean PDF should have no NeedsRendering"
6924        );
6925        assert_eq!(validation.page_count, 1, "clean PDF should report 1 page");
6926        assert!(
6927            validation.warnings.is_empty(),
6928            "clean PDF should produce no warnings, got: {:?}",
6929            validation.warnings
6930        );
6931    }
6932
6933    /// validate_flattened_pdf must not panic on empty input.
6934    #[test]
6935    fn validate_flattened_pdf_does_not_panic_on_empty_input() {
6936        let result = validate_flattened_pdf(&[]);
6937        // Should return Ok with a warning, not panic.
6938        assert!(
6939            result.is_ok(),
6940            "expected Ok from empty input, got: {:?}",
6941            result.err()
6942        );
6943        let v = result.unwrap();
6944        assert_eq!(v.page_count, 0, "empty input has 0 pages");
6945        assert!(
6946            !v.warnings.is_empty(),
6947            "empty input should produce at least one warning"
6948        );
6949    }
6950
6951    // -----------------------------------------------------------------------
6952    // XFA-F6-04 (#1112): compare_flatten_quality tests
6953    // -----------------------------------------------------------------------
6954
6955    /// Page count comparison works correctly via compare_flatten_quality.
6956    #[test]
6957    fn compare_flatten_quality_page_count_comparison() {
6958        let original = build_xfa_pdf(SIMPLE_XDP);
6959        let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6960        let metrics =
6961            compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6962        // Both before and after must parse to at least 1 page.
6963        assert!(
6964            metrics.page_count_before >= 1,
6965            "original must have >= 1 page"
6966        );
6967        assert!(
6968            metrics.page_count_after >= 1,
6969            "flattened must have >= 1 page"
6970        );
6971        // page_count_match must reflect equality.
6972        assert_eq!(
6973            metrics.page_count_match,
6974            metrics.page_count_before == metrics.page_count_after,
6975            "page_count_match must equal page_count_before == page_count_after"
6976        );
6977    }
6978
6979    /// Content ratio is computed correctly.
6980    #[test]
6981    fn compare_flatten_quality_content_ratio_computed() {
6982        let original = build_xfa_pdf(SIMPLE_XDP);
6983        let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6984        let metrics =
6985            compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6986        // Ratio should be a finite positive number.
6987        assert!(
6988            metrics.content_ratio.is_finite() && metrics.content_ratio >= 0.0,
6989            "content_ratio must be finite and >= 0, got: {}",
6990            metrics.content_ratio
6991        );
6992        // Verify the ratio matches the raw values.
6993        let expected = if metrics.content_stream_bytes_before == 0 {
6994            1.0_f64
6995        } else {
6996            metrics.content_stream_bytes_after as f64 / metrics.content_stream_bytes_before as f64
6997        };
6998        assert!(
6999            (metrics.content_ratio - expected).abs() < 1e-9,
7000            "content_ratio mismatch: expected {expected}, got {}",
7001            metrics.content_ratio
7002        );
7003    }
7004
7005    // -----------------------------------------------------------------------
7006    // XFA-F7-02 (#1114): validate_text_completeness tests
7007    // -----------------------------------------------------------------------
7008
7009    /// validate_text_completeness returns completeness_ratio = 1.0 when the
7010    /// original XFA bytes have no datasets packet (nothing to check).
7011    #[test]
7012    fn validate_text_completeness_no_datasets_returns_perfect_ratio() {
7013        // Build an XFA PDF whose XDP has no <datasets> packet — just a template.
7014        let xdp = r#"<?xml version="1.0"?>
7015<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
7016  <template>
7017    <subform name="root">
7018      <field name="greeting"><ui><textEdit/></ui></field>
7019    </subform>
7020  </template>
7021</xdp:xdp>"#;
7022        let original = build_xfa_pdf(xdp);
7023        // Use a minimal clean PDF as the "flattened" output.
7024        let mut doc = Document::with_version("1.4");
7025        let pages_id = doc.new_object_id();
7026        let page_id = doc.add_object(Object::Dictionary(dictionary! {
7027            "Type"     => Object::Name(b"Page".to_vec()),
7028            "Parent"   => Object::Reference(pages_id),
7029            "MediaBox" => Object::Array(vec![
7030                Object::Integer(0), Object::Integer(0),
7031                Object::Integer(612), Object::Integer(792),
7032            ])
7033        }));
7034        doc.objects.insert(
7035            pages_id,
7036            Object::Dictionary(dictionary! {
7037                "Type"  => Object::Name(b"Pages".to_vec()),
7038                "Kids"  => Object::Array(vec![Object::Reference(page_id)]),
7039                "Count" => Object::Integer(1)
7040            }),
7041        );
7042        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
7043            "Type"  => Object::Name(b"Catalog".to_vec()),
7044            "Pages" => Object::Reference(pages_id)
7045        }));
7046        doc.trailer.set("Root", Object::Reference(catalog_id));
7047        let mut flattened = Vec::new();
7048        doc.save_to(&mut flattened).unwrap();
7049
7050        let result = validate_text_completeness(&original, &flattened)
7051            .expect("validate_text_completeness should not fail");
7052        assert!(
7053            result.expected_values.is_empty(),
7054            "no datasets packet means no expected values"
7055        );
7056        assert_eq!(
7057            result.completeness_ratio, 1.0,
7058            "empty expected set should yield ratio 1.0"
7059        );
7060    }
7061
7062    /// validate_text_completeness returns ratio 1.0 on empty inputs (no panic).
7063    #[test]
7064    fn validate_text_completeness_empty_inputs_do_not_panic() {
7065        let result = validate_text_completeness(&[], &[]);
7066        assert!(result.is_ok(), "should return Ok on empty inputs");
7067        let v = result.unwrap();
7068        assert_eq!(v.completeness_ratio, 1.0);
7069        assert!(v.expected_values.is_empty());
7070        assert!(v.missing_values.is_empty());
7071    }
7072
7073    // -----------------------------------------------------------------------
7074    // XFA-F9-03 (#1122): Debug logging — no panic/error on empty/non-XFA input
7075    // -----------------------------------------------------------------------
7076
7077    /// Calling `flatten_xfa_to_pdf` with completely empty input must not panic
7078    /// and must return an Ok (pass-through) or a well-formed Err.
7079    ///
7080    /// This also exercises the logging infrastructure: no log::error! calls
7081    /// should be emitted for inputs that simply have no XFA content.
7082    #[test]
7083    fn flatten_empty_bytes_does_not_panic_and_does_not_error() {
7084        // Empty byte slice: not a PDF, no XFA markers — should return Ok([])
7085        // or at worst a well-formed Err (not a panic).
7086        // We only assert it does not panic; Ok or Err is acceptable.
7087        let _ = flatten_xfa_to_pdf(b"");
7088    }
7089
7090    /// Non-XFA PDF bytes: flatten_xfa_to_pdf must return the input unchanged
7091    /// and must not emit any log errors.
7092    #[test]
7093    fn flatten_non_xfa_bytes_returns_input_unchanged() {
7094        // A trivial byte string that looks vaguely like PDF but has no /AcroForm
7095        // and no xdp:xdp — the pre-check at the start of flatten_xfa_to_pdf
7096        // should return immediately with the original bytes cloned.
7097        let input = b"%PDF-1.4\n%%EOF\n";
7098        // Err is acceptable for degenerate input; a success must pass through.
7099        if let Ok(out) = flatten_xfa_to_pdf(input) {
7100            assert_eq!(out, input, "non-XFA input should pass through unchanged");
7101        }
7102    }
7103}
pdf_xfa/flatten.rs

pdf_xfa/
flatten.rs