pdf_xfa/
extract.rs

1//! XFA packet extraction from PDF via pdf-syntax.
2use crate::error::{Result, XfaError};
3use pdf_syntax::object::dict::keys::{ACRO_FORM, XFA};
4use pdf_syntax::object::{Array, Dict, Object, Stream};
5use pdf_syntax::{Filter, Pdf};
6
7/// Return `true` when a stream's filters indicate that it is an image or other
8/// binary blob that cannot contain XFA XML text.
9///
10/// XFA `<xdp:xdp>` packets are stored as plain XML (optionally `/FlateDecode`
11/// compressed). They are never wrapped with image codecs such as JPEG, JPEG
12/// 2000, JBIG2 or CCITT fax. Detecting these filters before invoking
13/// [`Stream::decoded`] avoids expensive image decoding during the XFA scan
14/// fallback (see PERF2-01: JPEG 2000 decode accounted for ~45% of wall-time
15/// on `edd_DE44.pdf`).
16fn is_image_only_stream(stream: &Stream<'_>) -> bool {
17    stream.filters().iter().any(|f| {
18        matches!(
19            f,
20            Filter::JpxDecode | Filter::DctDecode | Filter::Jbig2Decode | Filter::CcittFaxDecode
21        )
22    })
23}
/// The XFA content recovered from a PDF, split into named packets.
#[derive(Debug, Clone, Default)]
pub struct XfaPackets {
    /// The complete XDP document text, when the packets were parsed out of a
    /// single XML document. Left as `None` by [`Default`].
    pub full_xml: Option<String>,
    /// `(name, xml)` pairs in the order they were discovered. Names may
    /// repeat (for example several `datasets` entries).
    pub packets: Vec<(String, String)>,
}

impl XfaPackets {
    /// Returns the XML of the first packet called `name`, or `None` when no
    /// packet has that name.
    pub fn get_packet(&self, name: &str) -> Option<&str> {
        for (packet_name, xml) in &self.packets {
            if packet_name == name {
                return Some(xml.as_str());
            }
        }
        None
    }

    /// Returns the first `template` packet, if any.
    pub fn template(&self) -> Option<&str> {
        self.get_packet("template")
    }

    /// Returns the largest `datasets` packet, if any.
    ///
    /// When multiple `datasets` packets exist (e.g. from incremental saves),
    /// the small/empty one is the original blank form and the larger one
    /// contains the filled data, so size is used to pick between them.
    pub fn datasets(&self) -> Option<&str> {
        let mut largest: Option<&str> = None;
        for (packet_name, xml) in &self.packets {
            if packet_name != "datasets" {
                continue;
            }
            // `>=` lets a later packet of equal length win, which is the
            // tie-breaking rule of `Iterator::max_by_key`.
            let is_larger = match largest {
                Some(current) => xml.len() >= current.len(),
                None => true,
            };
            if is_larger {
                largest = Some(xml.as_str());
            }
        }
        largest
    }

    /// Returns the first `config` packet, if any.
    pub fn config(&self) -> Option<&str> {
        self.get_packet("config")
    }

    /// Returns the first `localeSet` packet, if any.
    pub fn locale_set(&self) -> Option<&str> {
        self.get_packet("localeSet")
    }
}
/// Outcome of probing the catalog/AcroForm structure for an XFA entry.
///
/// Distinguishes "AcroForm is readable but carries no XFA" from "the AcroForm
/// structure itself was unreadable". This lets [`extract_xfa`] skip the
/// expensive whole-document [`scan_for_xfa`] fallback when we can prove from
/// the spec-compliant location (catalog → /AcroForm → /XFA, XFA 3.3 §3) that
/// no XFA is present.
///
/// QF1-A (L-01 long-form hotspot): documents like California EDD `DE 44` carry
/// `/AcroForm` (widget-based interactive form) without `/XFA`. The byte
/// pre-check in `flatten_xfa_to_pdf_internal` (looking for `/AcroForm` or
/// `xdp:xdp`) is permissive on purpose, so it falls through to
/// `extract_xfa_from_bytes`, which previously had to walk every compressed
/// object stream looking for `<xdp:xdp` before giving up (~775 ms / ~56 % of
/// wall time on `edd_DE44.pdf`). With this probe we early-return
/// [`XfaError::PacketNotFound`] for the pure-AcroForm case, leaving the
/// `scan_for_xfa` fallback only for genuinely broken catalog/AcroForm
/// structures.
#[derive(Debug)]
enum AcroformProbe {
    /// AcroForm exists and has an /XFA entry (stream or array) from which a
    /// packet set was built. The set may still be empty if none of the
    /// referenced packets could be decoded.
    XfaFound(XfaPackets),
    /// Catalog read OK; no AcroForm dictionary in catalog. Per XFA 3.3 §3 the
    /// XFA packets live inside the AcroForm dictionary, so no AcroForm =
    /// no XFA. (A document MAY in principle carry an orphaned XFA stream
    /// outside the spec layout; with a readable catalog that stream is NOT
    /// searched for. The fallback scan only runs for
    /// [`AcroformProbe::Unreadable`], i.e. when the catalog itself fails.)
    NoAcroform,
    /// AcroForm exists and is readable, but the dictionary has no /XFA entry.
    /// Spec-compliant pure-AcroForm form — `scan_for_xfa` would just burn
    /// time walking every object stream and return PacketNotFound anyway.
    AcroformNoXfa,
    /// The catalog could not be read at all (truncated PDF, repaired xref
    /// still broken, etc.), or the /XFA stream exists but could not be
    /// decoded. Fall through to the best-effort whole-document scan.
    Unreadable,
}
103
104/// extract_xfa.
105pub fn extract_xfa(pdf: &Pdf) -> Result<XfaPackets> {
106    match probe_acroform_for_xfa(pdf) {
107        AcroformProbe::XfaFound(mut p) => {
108            // If the datasets packet is empty/tiny (common with incremental saves
109            // where Adobe Reader writes a new datasets object but doesn't update
110            // the XFA array reference), scan all objects for a larger one.
111            let current_ds_len = p.datasets().map(|s| s.len()).unwrap_or(0);
112            if current_ds_len < 200 {
113                if let Some(better_ds) = scan_for_datasets(pdf, current_ds_len) {
114                    p.packets.push(("datasets".to_string(), better_ds));
115                }
116            }
117            Ok(p)
118        }
119        AcroformProbe::NoAcroform | AcroformProbe::AcroformNoXfa => {
120            // QF1-A: skip the expensive whole-document object scan. Per XFA 3.3 §3
121            // the XFA packets live inside the AcroForm dictionary, so a
122            // spec-compliant document without /AcroForm /XFA cannot carry XFA.
123            // The caller (`flatten_xfa_to_pdf_internal`) will route to
124            // `static_fallback`, preserving correctness while eliminating
125            // ~775 ms of wasted scan time on pure-AcroForm long-form docs like
126            // `edd_DE44.pdf`.
127            Err(XfaError::PacketNotFound("no XFA content found".to_string()))
128        }
129        AcroformProbe::Unreadable => {
130            // Best-effort fallback: the catalog/AcroForm structure could not be
131            // read, but the byte-level pre-check found XFA-like markers. Scan
132            // every non-image stream for a `<xdp:xdp` packet.
133            scan_for_xfa(pdf)
134        }
135    }
136}
137
138/// Probe the catalog/AcroForm structure for the presence of `/XFA`.
139///
140/// Returns the strongest outcome we can determine from a small number of
141/// xref hits, so [`extract_xfa`] can avoid the expensive whole-document
142/// fallback when we have positive evidence that no XFA can be present.
143fn probe_acroform_for_xfa(pdf: &Pdf) -> AcroformProbe {
144    let xref = pdf.xref();
145    let Some(catalog): Option<Dict<'_>> = xref.get(xref.root_id()) else {
146        return AcroformProbe::Unreadable;
147    };
148    let Some(acroform): Option<Dict<'_>> = catalog.get(ACRO_FORM) else {
149        return AcroformProbe::NoAcroform;
150    };
151    if let Some(stream) = acroform.get::<Stream<'_>>(XFA) {
152        if let Some(decoded) = decode_stream(&stream) {
153            return AcroformProbe::XfaFound(parse_xfa_xml(&decoded));
154        }
155        // /XFA stream object existed but could not be decoded — treat as
156        // unreadable so the scan fallback gets a chance.
157        return AcroformProbe::Unreadable;
158    }
159    if let Some(array) = acroform.get::<Array<'_>>(XFA) {
160        return AcroformProbe::XfaFound(extract_from_array(&array));
161    }
162    AcroformProbe::AcroformNoXfa
163}
164
165/// Scan all PDF stream objects for a datasets packet larger than `min_len`.
166/// Returns the largest found, if any.
167fn scan_for_datasets(pdf: &Pdf, min_len: usize) -> Option<String> {
168    let mut best: Option<String> = None;
169    for obj in pdf.objects() {
170        if let Object::Stream(s) = obj {
171            // Skip image streams — they never contain XFA datasets XML.
172            if is_image_only_stream(&s) {
173                continue;
174            }
175            if let Some(d) = decode_stream(&s) {
176                if d.len() > min_len
177                    && d.contains("<xfa:datasets")
178                    && best.as_ref().is_none_or(|b| d.len() > b.len())
179                {
180                    best = Some(d);
181                }
182            }
183        }
184    }
185    best
186}
187/// extract_xfa_from_bytes.
188pub fn extract_xfa_from_bytes(data: impl Into<pdf_syntax::PdfData>) -> Result<XfaPackets> {
189    let pdf = Pdf::new(data).map_err(|e| XfaError::LoadFailed(format!("{e:?}")))?;
190    extract_xfa(&pdf)
191}
192/// extract_xfa_from_acroform.
193pub fn extract_xfa_from_acroform(pdf: &Pdf) -> Option<XfaPackets> {
194    let xref = pdf.xref();
195    let catalog: Dict<'_> = xref.get(xref.root_id())?;
196    let acroform: Dict<'_> = catalog.get(ACRO_FORM)?;
197    if let Some(stream) = acroform.get::<Stream<'_>>(XFA) {
198        return Some(parse_xfa_xml(&decode_stream(&stream)?));
199    }
200    if let Some(array) = acroform.get::<Array<'_>>(XFA) {
201        return Some(extract_from_array(&array));
202    }
203    None
204}
205
206fn extract_from_array(array: &Array<'_>) -> XfaPackets {
207    let mut packets = XfaPackets::default();
208    let items: Vec<Object<'_>> = array.iter::<Object<'_>>().collect();
209    let mut i = 0;
210    while i + 1 < items.len() {
211        let name = match &items[i] {
212            Object::String(s) => std::string::String::from_utf8_lossy(s.as_bytes()).to_string(),
213            Object::Name(n) => std::string::String::from_utf8_lossy(n.as_ref()).to_string(),
214            _ => {
215                i += 1;
216                continue;
217            }
218        };
219        if let Some(c) = match &items[i + 1] {
220            Object::Stream(s) => decode_stream(s),
221            Object::String(s) => {
222                Some(std::string::String::from_utf8_lossy(s.as_bytes()).to_string())
223            }
224            _ => None,
225        } {
226            packets.packets.push((name, c));
227        }
228        i += 2;
229    }
230    packets
231}
232
233fn scan_for_xfa(pdf: &Pdf) -> Result<XfaPackets> {
234    // Cap the number of streams we decompress to avoid multi-second stalls on
235    // large non-XFA PDFs. XFA XDP streams are typically among the first few
236    // hundred objects. If we haven't found one after 2000 streams, give up.
237    //
238    // PERF (PERF2-02): skip image streams (`/JPXDecode`, `/DCTDecode`,
239    // `/JBIG2Decode`, `/CCITTFaxDecode`) entirely. XFA packets are XML text
240    // wrapped at most by `/FlateDecode`; they never use image codecs. Decoding
241    // a JPEG 2000 image costs hundreds of milliseconds and dominated the
242    // wall-time on `edd_DE44.pdf` (PERF2-01 hotspot report).
243    let mut streams_checked = 0u32;
244    for obj in pdf.objects() {
245        if let Object::Stream(s) = obj {
246            if is_image_only_stream(&s) {
247                continue;
248            }
249            streams_checked += 1;
250            if streams_checked > 2000 {
251                break;
252            }
253            if let Some(d) = decode_stream(&s) {
254                if d.contains("<xdp:xdp") {
255                    return Ok(parse_xfa_xml(&d));
256                }
257            }
258        }
259    }
260    Err(XfaError::PacketNotFound("no XFA content found".to_string()))
261}
262
263fn decode_stream(stream: &Stream<'_>) -> Option<String> {
264    std::string::String::from_utf8(stream.decoded().ok()?).ok()
265}
266
267fn parse_xfa_xml(xml: &str) -> XfaPackets {
268    let mut packets = XfaPackets {
269        full_xml: Some(xml.to_string()),
270        packets: Vec::new(),
271    };
272    let t = xml.trim();
273    let c = t.find("?>").map(|p| &t[p + 2..]).unwrap_or(t).trim();
274    let inner = match c.find('>') {
275        Some(s) => {
276            let rest = &c[s + 1..];
277            rest.rfind("</xdp:xdp>")
278                .map(|e| &rest[..e])
279                .or_else(|| rest.rfind("</xdp>").map(|e| &rest[..e]))
280                .unwrap_or(rest)
281        }
282        None => return packets,
283    };
284    let mut pos = 0;
285    let bytes = inner.as_bytes();
286    while pos < bytes.len() {
287        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
288            pos += 1;
289        }
290        if pos >= bytes.len() {
291            break;
292        }
293        if bytes[pos] != b'<' {
294            pos += 1;
295            continue;
296        }
297        if inner[pos..].starts_with("<!--") {
298            if let Some(e) = inner[pos..].find("-->") {
299                pos += e + 3;
300                continue;
301            }
302        }
303        if inner[pos..].starts_with("<?") {
304            if let Some(e) = inner[pos..].find("?>") {
305                pos += e + 2;
306                continue;
307            }
308        }
309        let ts = pos;
310        pos += 1;
311        let ns = pos;
312        while pos < bytes.len() && bytes[pos] != b'>' && bytes[pos] != b' ' && bytes[pos] != b'/' {
313            pos += 1;
314        }
315        let ft = &inner[ns..pos];
316        let pn = ft.split(':').next_back().unwrap_or(ft);
317        let ct = format!("</{ft}>");
318        let at = format!("</xfa:{pn}>");
319        if let Some(cp) = inner[ts..].find(ct.as_str()) {
320            let ee = ts + cp + ct.len();
321            packets
322                .packets
323                .push((pn.to_string(), inner[ts..ee].to_string()));
324            pos = ee;
325        } else if let Some(cp) = inner[ts..].find(at.as_str()) {
326            let ee = ts + cp + at.len();
327            packets
328                .packets
329                .push((pn.to_string(), inner[ts..ee].to_string()));
330            pos = ee;
331        } else {
332            while pos < bytes.len() && bytes[pos] != b'>' {
333                pos += 1;
334            }
335            pos += 1;
336        }
337    }
338    packets
339}
340
341// ─── Packet validation ───────────────────────────────────────────────────────
342
/// Summary of what was found (or missing) in a set of [`XfaPackets`].
///
/// Returned by [`validate_xfa_packets`].  Intended for diagnostics, logging,
/// and deciding how to handle unusual or incomplete XFA documents.
#[derive(Debug, Clone, Default)]
pub struct PacketValidation {
    /// `true` when a `template` packet is present.
    pub has_template: bool,
    /// `true` when at least one `datasets` packet is present.
    pub has_datasets: bool,
    /// `true` when a `config` packet is present.
    pub has_config: bool,
    /// Byte length of the template packet (0 if absent).
    pub template_bytes: usize,
    /// Byte length of the largest datasets packet (0 if absent).
    pub datasets_bytes: usize,
    /// Names of all packets in document order (duplicates are kept).
    pub packet_names: Vec<String>,
    /// Human-readable warnings about missing or suspicious content.
    ///
    /// [`validate_xfa_packets`] adds one entry when the template is missing
    /// or shorter than 100 bytes, and one when the datasets packet is
    /// missing or shorter than 50 bytes. Empty when nothing looks wrong.
    pub warnings: Vec<String>,
}
364
365/// Validate the contents of [`XfaPackets`] and return a [`PacketValidation`].
366///
367/// This function never panics and never fails — it always returns a result,
368/// even for empty or degenerate packet sets.
369pub fn validate_xfa_packets(packets: &XfaPackets) -> PacketValidation {
370    let has_template = packets.template().is_some();
371    let has_datasets = packets.datasets().is_some();
372    let has_config = packets.config().is_some();
373
374    let template_bytes = packets.template().map(|s| s.len()).unwrap_or(0);
375    let datasets_bytes = packets.datasets().map(|s| s.len()).unwrap_or(0);
376    let packet_names = packets.packets.iter().map(|(n, _)| n.clone()).collect();
377
378    let mut warnings = Vec::new();
379
380    if !has_template {
381        warnings.push("No template packet found".to_string());
382    } else if template_bytes < 100 {
383        warnings.push(format!(
384            "Template packet is empty (< 100 bytes) — only {template_bytes} bytes"
385        ));
386    }
387
388    if !has_datasets {
389        warnings.push("No datasets packet".to_string());
390    } else if datasets_bytes < 50 {
391        warnings.push(format!(
392            "Datasets packet is suspiciously small (< 50 bytes) — only {datasets_bytes} bytes"
393        ));
394    }
395
396    PacketValidation {
397        has_template,
398        has_datasets,
399        has_config,
400        template_bytes,
401        datasets_bytes,
402        packet_names,
403        warnings,
404    }
405}
406// ─── Embedded font extraction ────────────────────────────────────────────────
407
408/// Extract embedded font programs from a PDF parsed with pdf-syntax.
409///
410/// Returns a list of `(name, raw_font_bytes)` pairs for every `FontDescriptor`
411/// object that carries a `FontFile`, `FontFile2`, or `FontFile3` stream.
412///
413/// # Relationship to the flatten pipeline
414///
415/// The flattening pipeline (`crate::flatten`) uses a separate lopdf-based
416/// variant (internal, `#[doc(hidden)]`) that additionally captures `/Widths`
417/// arrays and encoding metadata needed for text measurement. That variant
418/// returns [`crate::font_bridge::EmbeddedFontData`] structs and is not part
419/// of the public extraction API.
420///
421/// Use this function when you only need the raw font bytes for inspection,
422/// subsetting, or external embedding outside the flatten pipeline.
423pub fn extract_embedded_fonts(pdf: &Pdf) -> Vec<(String, Vec<u8>)> {
424    use pdf_syntax::object::dict::keys::{FONT_FILE, FONT_FILE2, FONT_FILE3, FONT_NAME, TYPE};
425    use pdf_syntax::object::Name;
426    let mut fonts = Vec::new();
427    for obj in pdf.objects() {
428        let dict = match &obj {
429            Object::Dict(d) => d.clone(),
430            Object::Stream(s) => s.dict().clone(),
431            _ => continue,
432        };
433        if dict
434            .get::<Name>(TYPE)
435            .is_none_or(|n| n.as_ref() != b"FontDescriptor")
436        {
437            continue;
438        }
439        let name = dict
440            .get::<Name>(FONT_NAME)
441            .map(|n| std::string::String::from_utf8_lossy(n.as_ref()).to_string())
442            .unwrap_or_default();
443        for key in [FONT_FILE2, FONT_FILE, FONT_FILE3] {
444            if let Some(s) = dict.get::<Stream<'_>>(key) {
445                if let Ok(d) = s.decoded() {
446                    if !d.is_empty() {
447                        fonts.push((name.clone(), d));
448                        break;
449                    }
450                }
451            }
452        }
453    }
454    fonts
455}
456
// Unit tests for the XML-level helpers. Everything here works on in-memory
// strings or hand-built `XfaPackets`; no PDF file is loaded.
#[cfg(test)]
mod tests {
    use super::*;
    // Declaration + template + namespaced datasets → two packets.
    #[test]
    fn parse_xfa_packets() {
        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="f1"><field name="T1"/></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><f1><T1>Hi</T1></f1></xfa:data></xfa:datasets></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(p.packets.len(), 2);
        assert!(p.template().is_some());
        assert!(p.datasets().is_some());
    }
    // A root element with no children yields no packets.
    #[test]
    fn empty_xfa() {
        let p = parse_xfa_xml(r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#);
        assert_eq!(p.packets.len(), 0);
    }

    // All accessors return `None` when the packet list is empty.
    #[test]
    fn get_packet_missing_returns_none() {
        let p = parse_xfa_xml(r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#);
        assert!(p.get_packet("template").is_none());
        assert!(p.get_packet("nonexistent").is_none());
        assert!(p.config().is_none());
        assert!(p.locale_set().is_none());
    }

    #[test]
    fn full_xml_preserved() {
        // full_xml should always capture the entire input string.
        // NOTE(review): the assertion below only checks for a substring; it
        // would still pass if `full_xml` were truncated. Consider comparing
        // against `xml` directly.
        let xml =
            r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        let stored = p.full_xml.as_deref().unwrap_or("");
        assert!(stored.contains("xdp:xdp"));
    }

    // The config packet nests an `<xdp>` element; it must still be captured
    // as a single packet.
    #[test]
    fn config_packet_parsed() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present></config></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(p.packets.len(), 1);
        assert!(p.config().is_some());
        assert!(p.template().is_none());
    }

    #[test]
    fn multiple_packets_order_preserved() {
        // template must come before datasets — order matches the XDP source order.
        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(p.packets.len(), 2);
        assert_eq!(p.packets[0].0, "template");
        assert_eq!(p.packets[1].0, "datasets");
        assert!(p.template().is_some());
        assert!(p.datasets().is_some());
    }

    // ── PacketValidation tests (issue #1085) ──────────────────────────────

    // Template ≥ 100 bytes and datasets ≥ 50 bytes → no warnings.
    #[test]
    fn validate_complete_packets_no_warnings() {
        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="firstName" xmlns:ui="http://www.xfa.org/schema/xfa-template/3.3/"><ui><textEdit/></ui></field></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root><firstName>Alice</firstName></root></xfa:data></xfa:datasets></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        let v = validate_xfa_packets(&p);
        assert!(v.has_template);
        assert!(v.has_datasets);
        assert!(v.template_bytes > 0);
        assert!(v.datasets_bytes > 0);
        assert!(
            v.warnings.is_empty(),
            "expected no warnings, got: {:?}",
            v.warnings
        );
    }

    #[test]
    fn validate_missing_template_produces_warning() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        let v = validate_xfa_packets(&p);
        assert!(!v.has_template);
        assert!(v
            .warnings
            .iter()
            .any(|w| w.contains("No template packet found")));
    }

    #[test]
    fn validate_missing_datasets_produces_warning() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="x"/><field name="y"/><field name="z"/><field name="w"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        let v = validate_xfa_packets(&p);
        assert!(!v.has_datasets);
        assert!(v.warnings.iter().any(|w| w.contains("No datasets packet")));
    }

    // Packets are built by hand so the template is far below 100 bytes.
    #[test]
    fn validate_tiny_template_produces_warning() {
        let mut p = XfaPackets::default();
        p.packets.push(("template".to_string(), "<t/>".to_string()));
        p.packets.push((
            "datasets".to_string(),
            "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"><xfa:data/></xfa:datasets>".to_string(),
        ));
        let v = validate_xfa_packets(&p);
        assert!(v.warnings.iter().any(|w| w.contains("< 100 bytes")));
    }

    #[test]
    fn validate_tiny_datasets_produces_warning() {
        let mut p = XfaPackets::default();
        // Give a substantial template so that warning comes from datasets only.
        p.packets.push((
            "template".to_string(),
            "<template xmlns=\"http://www.xfa.org/schema/xfa-template/3.3/\"><subform name=\"root\"><field name=\"a\"/><field name=\"b\"/><field name=\"c\"/></subform></template>".to_string(),
        ));
        p.packets
            .push(("datasets".to_string(), "<ds/>".to_string()));
        let v = validate_xfa_packets(&p);
        assert!(v.warnings.iter().any(|w| w.contains("< 50 bytes")));
    }

    #[test]
    fn validate_packet_names_list() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present/></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="f1"/><field name="f2"/><field name="f3"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        let v = validate_xfa_packets(&p);
        assert!(v.packet_names.contains(&"config".to_string()));
        assert!(v.packet_names.contains(&"template".to_string()));
        assert!(v.has_config);
    }

    // ── XFA corpus tests (issue #1086) ────────────────────────────────────
    // Ten synthetic tests covering representative XFA document patterns.
    // Each test uses small in-memory XML strings — no real PDFs required.

    /// 1. Static XFA form detection via baseProfile.
    #[test]
    fn corpus_01_static_xfa_form_detection() {
        use crate::classify::{detect_xfa_type_from_packets, XfaType};
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/" baseProfile="interactiveForms"><subform name="Page1"><field name="LastName"/><field name="FirstName"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(detect_xfa_type_from_packets(&p), XfaType::Static);
    }

    /// 2. Dynamic XFA form detection (no baseProfile constraint).
    #[test]
    fn corpus_02_dynamic_xfa_form_detection() {
        use crate::classify::{detect_xfa_type_from_packets, XfaType};
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><occur min="0" max="-1"/><field name="item"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(detect_xfa_type_from_packets(&p), XfaType::Dynamic);
    }

    /// 3. XFA with multiple packets (template + datasets + config).
    #[test]
    fn corpus_03_multiple_packets() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert_eq!(p.packets.len(), 3, "should have config, template, datasets");
        assert!(p.config().is_some());
        assert!(p.template().is_some());
        assert!(p.datasets().is_some());
    }

    /// 4. XFA with no datasets (template-only — blank form, no data bound).
    #[test]
    fn corpus_04_template_only_no_datasets() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="LastName"/><field name="FirstName"/><field name="DOB"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert!(p.template().is_some());
        assert!(p.datasets().is_none());
        let v = validate_xfa_packets(&p);
        assert!(!v.has_datasets);
        assert!(v.warnings.iter().any(|w| w.contains("No datasets")));
    }

    /// 5. XFA with binary-like image data embedded in datasets (base64 blob).
    #[test]
    fn corpus_05_xfa_with_image_data_in_datasets() {
        // Simulate datasets containing a base64-encoded image field.
        let b64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
        let xml = format!(
            r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="photo"><ui><imageEdit/></ui></field></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root><photo contentType="image/png" href="">{b64_image}</photo></root></xfa:data></xfa:datasets></xdp:xdp>"#
        );
        let p = parse_xfa_xml(&xml);
        assert!(p.template().is_some());
        assert!(p.datasets().is_some());
        let ds = p.datasets().unwrap();
        assert!(ds.contains(b64_image), "datasets should contain image data");
    }

    /// 6. Non-XFA PDF (header and `%%EOF` only, no AcroForm/XFA) returns
    /// XfaType::None.
    #[test]
    fn corpus_06_non_xfa_pdf_returns_none() {
        use crate::classify::{detect_xfa_type, XfaType};
        // A plain PDF header with no AcroForm/XFA.
        let not_xfa: &[u8] = b"%PDF-1.4\n%%EOF";
        assert_eq!(detect_xfa_type(not_xfa), XfaType::None);
    }

    /// 7. XFA with config packet — config is correctly parsed and accessible.
    #[test]
    fn corpus_07_xfa_with_config_packet() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present><pdf><version>1.6</version></pdf></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert!(p.config().is_some());
        let cfg = p.config().unwrap();
        assert!(cfg.contains("packets"));
        let v = validate_xfa_packets(&p);
        assert!(v.has_config);
    }

    /// 8. Empty datasets packet (incremental save pattern — original blank form).
    #[test]
    fn corpus_08_empty_datasets_incremental_save_pattern() {
        // Two datasets entries: original blank (small) and filled (larger).
        let mut p = XfaPackets::default();
        p.packets.push((
            "template".to_string(),
            "<template xmlns=\"http://www.xfa.org/schema/xfa-template/3.3/\"><subform name=\"root\"><field name=\"qty\"/><field name=\"price\"/><field name=\"total\"/></subform></template>".to_string(),
        ));
        // Blank (incremental save artefact — very small):
        p.packets.push((
            "datasets".to_string(),
            "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"/>".to_string(),
        ));
        // Filled (the real data):
        p.packets.push(("datasets".to_string(), "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"><xfa:data><root><qty>3</qty><price>9.99</price><total>29.97</total></root></xfa:data></xfa:datasets>".to_string()));
        // datasets() must return the LARGEST entry.
        let ds = p.datasets().expect("datasets should exist");
        assert!(
            ds.contains("29.97"),
            "should return the larger/filled datasets"
        );
    }

    /// 9. Large template with many fields — validation should have no warnings.
    #[test]
    fn corpus_09_large_template_many_fields() {
        // Build a template with 20 fields to ensure validation handles size correctly.
        let fields: String = (1..=20)
            .map(|i| format!("<field name=\"field{i}\"><ui><textEdit/></ui></field>"))
            .collect();
        let xml = format!(
            r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root">{fields}</subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root>{}</root></xfa:data></xfa:datasets></xdp:xdp>"#,
            (1..=20)
                .map(|i| format!("<field{i}>val{i}</field{i}>"))
                .collect::<String>()
        );
        let p = parse_xfa_xml(&xml);
        let v = validate_xfa_packets(&p);
        assert!(v.has_template);
        assert!(v.has_datasets);
        assert!(
            v.template_bytes >= 100,
            "large template should exceed 100 bytes"
        );
        assert!(
            v.warnings.is_empty(),
            "no warnings expected: {:?}",
            v.warnings
        );
    }

    /// 10. XFA with localeSet packet — localeSet is correctly accessible.
    #[test]
    fn corpus_10_xfa_with_locale_set_packet() {
        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><localeSet xmlns="http://www.xfa.org/schema/xfa-locale-set/2.7/"><locale name="en_US" desc="English (United States)"><calendarSymbols name="gregorian"/></locale></localeSet><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="date"/></subform></template></xdp:xdp>"#;
        let p = parse_xfa_xml(xml);
        assert!(
            p.locale_set().is_some(),
            "localeSet packet should be accessible"
        );
        assert!(p.template().is_some());
        let ls = p.locale_set().unwrap();
        assert!(ls.contains("en_US"));
    }
}
pdf_xfa/extract.rs

pdf_xfa/
extract.rs