Skip to main content

pdf_xfa/
extract.rs

1//! XFA packet extraction from PDF via pdf-syntax.
2use crate::error::{Result, XfaError};
3use pdf_syntax::object::dict::keys::{ACRO_FORM, XFA};
4use pdf_syntax::object::{Array, Dict, Object, Stream};
5use pdf_syntax::Pdf;
/// The XFA packets recovered from a PDF document.
#[derive(Debug, Clone, Default)]
pub struct XfaPackets {
    /// The complete XML document the packets were split out of, when they
    /// came from a single XML string; `None` otherwise.
    pub full_xml: Option<String>,
    /// `(name, xml)` pairs in discovery order. A name may occur more than
    /// once (see [`XfaPackets::datasets`]).
    pub packets: Vec<(String, String)>,
}

impl XfaPackets {
    /// Returns the XML of the first packet whose name equals `name`.
    pub fn get_packet(&self, name: &str) -> Option<&str> {
        self.packets
            .iter()
            .find_map(|(label, body)| (label == name).then_some(body.as_str()))
    }

    /// Returns the first `template` packet, if any.
    pub fn template(&self) -> Option<&str> {
        self.get_packet("template")
    }

    /// Returns the longest `datasets` packet, if any.
    ///
    /// Several `datasets` packets can coexist (e.g. after incremental
    /// saves): the small or empty one is the original blank form, the larger
    /// one carries the filled-in data, so the longest is preferred. Among
    /// packets of equal length the last one wins.
    pub fn datasets(&self) -> Option<&str> {
        self.packets
            .iter()
            .filter_map(|(label, body)| (label == "datasets").then_some(body.as_str()))
            .max_by_key(|body| body.len())
    }

    /// Returns the first `config` packet, if any.
    pub fn config(&self) -> Option<&str> {
        self.get_packet("config")
    }

    /// Returns the first `localeSet` packet, if any.
    pub fn locale_set(&self) -> Option<&str> {
        self.get_packet("localeSet")
    }
}
48/// extract_xfa.
49pub fn extract_xfa(pdf: &Pdf) -> Result<XfaPackets> {
50    if let Some(mut p) = extract_xfa_from_acroform(pdf) {
51        if !p.packets.is_empty() || p.full_xml.is_some() {
52            // If the datasets packet is empty/tiny (common with incremental saves
53            // where Adobe Reader writes a new datasets object but doesn't update
54            // the XFA array reference), scan all objects for a larger one.
55            let current_ds_len = p.datasets().map(|s| s.len()).unwrap_or(0);
56            if current_ds_len < 200 {
57                if let Some(better_ds) = scan_for_datasets(pdf, current_ds_len) {
58                    p.packets.push(("datasets".to_string(), better_ds));
59                }
60            }
61            return Ok(p);
62        }
63    }
64    scan_for_xfa(pdf)
65}
66
67/// Scan all PDF stream objects for a datasets packet larger than `min_len`.
68/// Returns the largest found, if any.
69///
70/// A stream qualifies as a datasets packet only when its **top-level** XML
71/// element is `<xfa:datasets …>` — i.e. after optional UTF-8 BOM,
72/// whitespace, XML declaration(s), and comments, the next characters are
73/// `<xfa:datasets` followed by `/`, `>`, or ASCII whitespace.
74///
75/// Why the structural check matters (Tier A wave 2 fix): the wider
76/// `contains("<xfa:datasets")` test mis-classified single-stream XDPs —
77/// where the entire `<xdp:xdp>{template, datasets, …}</xdp:xdp>` lives in
78/// one `/XFA` stream — as datasets packets, because such streams contain
79/// `<xfa:datasets>` as a descendant. The misclassified stream then won
80/// the "largest wins" tie-break in [`XfaPackets::datasets`], the DataDom
81/// was built over the entire XDP, and template attributes leaked into
82/// the data namespace. The new check still accepts the incremental-save
83/// artefact this scan was designed to catch (those streams begin with
84/// `<xfa:datasets` directly) while rejecting whole-XDP candidates.
85fn scan_for_datasets(pdf: &Pdf, min_len: usize) -> Option<String> {
86    let mut best: Option<String> = None;
87    for obj in pdf.objects() {
88        if let Object::Stream(s) = obj {
89            if let Some(d) = decode_stream(&s) {
90                if d.len() > min_len
91                    && looks_like_datasets_packet(&d)
92                    && best.as_ref().is_none_or(|b| d.len() > b.len())
93                {
94                    best = Some(d);
95                }
96            }
97        }
98    }
99    best
100}
101
/// Gathers the namespace declarations found in an element's opening tag.
///
/// `opener` is the text of the wrapper element's start tag up to (not
/// including) its closing `>`, typically `<xdp:xdp …`. Each returned entry
/// is the verbatim `xmlns[:prefix]="uri"` snippet (quotes included, either
/// quote style, whitespace around `=` preserved), ready to be pasted into
/// another element's opening tag.
///
/// Scanning stops at the first declaration whose quoted value is not
/// terminated; anything else that merely resembles a declaration is skipped.
fn extract_xmlns_decls(opener: &str) -> Vec<String> {
    const KEYWORD: &str = "xmlns";

    fn is_space(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t' | b'\r' | b'\n')
    }

    /// Index of the first non-space byte at or after `at`.
    fn skip_spaces(raw: &[u8], mut at: usize) -> usize {
        while raw.get(at).copied().is_some_and(is_space) {
            at += 1;
        }
        at
    }

    let raw = opener.as_bytes();
    let mut decls = Vec::new();
    let mut cursor = 0;
    while let Some(offset) = opener[cursor..].find(KEYWORD) {
        let start = cursor + offset;
        // Default resume point: one byte past this hit, used by every
        // `continue` below when the hit turns out not to be a declaration.
        cursor = start + 1;

        // The keyword must begin an attribute name: at the very start, or
        // right after whitespace or '/'.
        let at_boundary = start == 0 || is_space(raw[start - 1]) || raw[start - 1] == b'/';
        if !at_boundary {
            continue;
        }

        // Optional `:prefix` part of the attribute name.
        let mut at = start + KEYWORD.len();
        if raw.get(at) == Some(&b':') {
            at += 1;
            while at < raw.len() && raw[at] != b'=' && !is_space(raw[at]) {
                at += 1;
            }
        }

        // `=` with optional whitespace on both sides.
        at = skip_spaces(raw, at);
        if raw.get(at) != Some(&b'=') {
            continue;
        }
        at = skip_spaces(raw, at + 1);

        // Quoted value; the closing quote must match the opening one.
        let quote = match raw.get(at) {
            Some(&q @ (b'"' | b'\'')) => q,
            _ => continue,
        };
        let value_from = at + 1;
        let Some(value_len) = raw[value_from..].iter().position(|&b| b == quote) else {
            break;
        };
        let closing = value_from + value_len;

        decls.push(opener[start..=closing].to_string());
        cursor = closing + 1;
    }
    decls
}
169
/// Re-injects namespace declarations from the wrapper element into a
/// freshly-extracted packet so it parses as a standalone XML fragment.
///
/// `raw` is the literal substring `<prefix:Name …>…</prefix:Name>` taken out
/// of the `<xdp:xdp>` wrapper. The wrapper's namespace declarations are not
/// part of `raw`, so re-parsing `raw` on its own fails on prefix lookups.
/// Every entry of `wrapper_xmlns` (verbatim `xmlns[:prefix]="uri"` snippets)
/// whose attribute name is not already present on the packet's opening tag
/// is appended to that tag, just before its `>` (or `/>`).
///
/// An attribute counts as present when its name follows whitespace and is
/// followed by `=`, with optional whitespace in between: XML allows
/// `xmlns:p = "…"`, and adding a second `xmlns:p` would make the tag
/// ill-formed.
///
/// Returns `raw` unchanged when there is nothing to add or when `raw` has no
/// `>` at all.
fn inject_missing_xmlns(raw: &str, wrapper_xmlns: &[String]) -> String {
    /// `true` when `opener` already carries an attribute named exactly `key`.
    fn declares(opener: &str, key: &str) -> bool {
        opener.match_indices(key).any(|(at, _)| {
            let starts_name = at == 0 || opener[..at].ends_with(char::is_whitespace);
            let then_equals = opener[at + key.len()..].trim_start().starts_with('=');
            starts_name && then_equals
        })
    }

    let Some(open_end) = raw.find('>') else {
        return raw.to_string();
    };
    let opener = &raw[..open_end];

    let mut additions = String::new();
    for decl in wrapper_xmlns {
        // Attribute name of the declaration, e.g. `xmlns` or `xmlns:xdp`.
        let key = decl.split('=').next().unwrap_or("").trim_end();
        if key.is_empty() || declares(opener, key) {
            continue;
        }
        additions.push(' ');
        additions.push_str(decl);
    }
    if additions.is_empty() {
        return raw.to_string();
    }

    // Insert right before the '>' of the opening tag, or before the '/' of
    // a self-closing '/>'.
    let trimmed = opener.trim_end();
    let insert_at = if trimmed.ends_with('/') {
        trimmed.len() - 1
    } else {
        open_end
    };

    let mut out = String::with_capacity(raw.len() + additions.len());
    out.push_str(&raw[..insert_at]);
    out.push_str(&additions);
    out.push_str(&raw[insert_at..]);
    out
}
219
/// Tells whether `s` is, structurally, a standalone XFA datasets packet.
///
/// After an optional UTF-8 BOM, leading whitespace, and any number of
/// `<?…?>` processing instructions / XML declarations and `<!--…-->`
/// comments, the text must continue with `<xfa:datasets` immediately
/// followed by `/`, `>`, or ASCII whitespace. An unterminated `<?` or `<!--`
/// makes the answer `false`.
///
/// [`scan_for_datasets`] relies on this to reject whole-XDP streams, which
/// only *contain* `<xfa:datasets>` as a descendant.
fn looks_like_datasets_packet(s: &str) -> bool {
    let mut cursor = s.trim_start_matches('\u{FEFF}').trim_start();

    // Peel off prolog items one at a time.
    loop {
        let (body, terminator) = if let Some(body) = cursor.strip_prefix("<?") {
            (body, "?>")
        } else if let Some(body) = cursor.strip_prefix("<!--") {
            (body, "-->")
        } else {
            break;
        };
        let Some(end) = body.find(terminator) else {
            return false;
        };
        cursor = body[end + terminator.len()..].trim_start();
    }

    // The element name must end right after `datasets`.
    cursor
        .strip_prefix("<xfa:datasets")
        .and_then(|after| after.bytes().next())
        .is_some_and(|b| matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'/' | b'>'))
}
252/// extract_xfa_from_bytes.
253pub fn extract_xfa_from_bytes(data: impl Into<pdf_syntax::PdfData>) -> Result<XfaPackets> {
254    let pdf = Pdf::new(data).map_err(|e| XfaError::LoadFailed(format!("{e:?}")))?;
255    extract_xfa(&pdf)
256}
257/// extract_xfa_from_acroform.
258pub fn extract_xfa_from_acroform(pdf: &Pdf) -> Option<XfaPackets> {
259    let xref = pdf.xref();
260    let catalog: Dict<'_> = xref.get(xref.root_id())?;
261    let acroform: Dict<'_> = catalog.get(ACRO_FORM)?;
262    if let Some(stream) = acroform.get::<Stream<'_>>(XFA) {
263        return Some(parse_xfa_xml(&decode_stream(&stream)?));
264    }
265    if let Some(array) = acroform.get::<Array<'_>>(XFA) {
266        return Some(extract_from_array(&array));
267    }
268    None
269}
270
271fn extract_from_array(array: &Array<'_>) -> XfaPackets {
272    let mut packets = XfaPackets::default();
273    let items: Vec<Object<'_>> = array.iter::<Object<'_>>().collect();
274    let mut i = 0;
275    while i + 1 < items.len() {
276        let name = match &items[i] {
277            Object::String(s) => std::string::String::from_utf8_lossy(s.as_bytes()).to_string(),
278            Object::Name(n) => std::string::String::from_utf8_lossy(n.as_ref()).to_string(),
279            _ => {
280                i += 1;
281                continue;
282            }
283        };
284        if let Some(c) = match &items[i + 1] {
285            Object::Stream(s) => decode_stream(s),
286            Object::String(s) => {
287                Some(std::string::String::from_utf8_lossy(s.as_bytes()).to_string())
288            }
289            _ => None,
290        } {
291            packets.packets.push((name, c));
292        }
293        i += 2;
294    }
295    packets
296}
297
298fn scan_for_xfa(pdf: &Pdf) -> Result<XfaPackets> {
299    // Cap the number of streams we decompress to avoid multi-second stalls on
300    // large non-XFA PDFs. XFA XDP streams are typically among the first few
301    // hundred objects. If we haven't found one after 2000 streams, give up.
302    let mut streams_checked = 0u32;
303    for obj in pdf.objects() {
304        if let Object::Stream(s) = obj {
305            streams_checked += 1;
306            if streams_checked > 2000 {
307                break;
308            }
309            if let Some(d) = decode_stream(&s) {
310                if d.contains("<xdp:xdp") {
311                    return Ok(parse_xfa_xml(&d));
312                }
313            }
314        }
315    }
316    Err(XfaError::PacketNotFound("no XFA content found".to_string()))
317}
318
319fn decode_stream(stream: &Stream<'_>) -> Option<String> {
320    std::string::String::from_utf8(stream.decoded().ok()?).ok()
321}
322
323fn parse_xfa_xml(xml: &str) -> XfaPackets {
324    let mut packets = XfaPackets {
325        full_xml: Some(xml.to_string()),
326        packets: Vec::new(),
327    };
328    let t = xml.trim();
329    let c = t.find("?>").map(|p| &t[p + 2..]).unwrap_or(t).trim();
330    let (inner, wrapper_xmlns) = match c.find('>') {
331        Some(s) => {
332            let opener = &c[..s]; // text between `<xdp:xdp` and the closing `>`
333            let wrapper_xmlns = extract_xmlns_decls(opener);
334            let rest = &c[s + 1..];
335            let inner = rest
336                .rfind("</xdp:xdp>")
337                .map(|e| &rest[..e])
338                .or_else(|| rest.rfind("</xdp>").map(|e| &rest[..e]))
339                .unwrap_or(rest);
340            (inner, wrapper_xmlns)
341        }
342        None => return packets,
343    };
344    let mut pos = 0;
345    let bytes = inner.as_bytes();
346    while pos < bytes.len() {
347        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
348            pos += 1;
349        }
350        if pos >= bytes.len() {
351            break;
352        }
353        if bytes[pos] != b'<' {
354            pos += 1;
355            continue;
356        }
357        if inner[pos..].starts_with("<!--") {
358            if let Some(e) = inner[pos..].find("-->") {
359                pos += e + 3;
360                continue;
361            }
362        }
363        if inner[pos..].starts_with("<?") {
364            if let Some(e) = inner[pos..].find("?>") {
365                pos += e + 2;
366                continue;
367            }
368        }
369        let ts = pos;
370        pos += 1;
371        let ns = pos;
372        while pos < bytes.len() && bytes[pos] != b'>' && bytes[pos] != b' ' && bytes[pos] != b'/' {
373            pos += 1;
374        }
375        let ft = &inner[ns..pos];
376        let pn = ft.split(':').next_back().unwrap_or(ft);
377        let ct = format!("</{ft}>");
378        let at = format!("</xfa:{pn}>");
379        if let Some(cp) = inner[ts..].find(ct.as_str()) {
380            let ee = ts + cp + ct.len();
381            let raw = &inner[ts..ee];
382            packets
383                .packets
384                .push((pn.to_string(), inject_missing_xmlns(raw, &wrapper_xmlns)));
385            pos = ee;
386        } else if let Some(cp) = inner[ts..].find(at.as_str()) {
387            let ee = ts + cp + at.len();
388            let raw = &inner[ts..ee];
389            packets
390                .packets
391                .push((pn.to_string(), inject_missing_xmlns(raw, &wrapper_xmlns)));
392            pos = ee;
393        } else {
394            while pos < bytes.len() && bytes[pos] != b'>' {
395                pos += 1;
396            }
397            pos += 1;
398        }
399    }
400    packets
401}
402
403// ─── Packet validation ───────────────────────────────────────────────────────
404
/// Diagnostic summary of a set of [`XfaPackets`]: what is present, how big
/// it is, and what looks wrong.
///
/// Produced by [`validate_xfa_packets`]. Meant for logging and for deciding
/// how to treat unusual or incomplete XFA documents.
#[derive(Debug, Clone, Default)]
pub struct PacketValidation {
    /// Whether a `template` packet exists.
    pub has_template: bool,
    /// Whether one or more `datasets` packets exist.
    pub has_datasets: bool,
    /// Whether a `config` packet exists.
    pub has_config: bool,
    /// Size of the template packet in bytes; 0 when there is none.
    pub template_bytes: usize,
    /// Size of the largest datasets packet in bytes; 0 when there is none.
    pub datasets_bytes: usize,
    /// Every packet name, in document order.
    pub packet_names: Vec<String>,
    /// Human-readable notes on missing or suspicious content.
    pub warnings: Vec<String>,
}
426
427/// Validate the contents of [`XfaPackets`] and return a [`PacketValidation`].
428///
429/// This function never panics and never fails — it always returns a result,
430/// even for empty or degenerate packet sets.
431pub fn validate_xfa_packets(packets: &XfaPackets) -> PacketValidation {
432    let has_template = packets.template().is_some();
433    let has_datasets = packets.datasets().is_some();
434    let has_config = packets.config().is_some();
435
436    let template_bytes = packets.template().map(|s| s.len()).unwrap_or(0);
437    let datasets_bytes = packets.datasets().map(|s| s.len()).unwrap_or(0);
438    let packet_names = packets.packets.iter().map(|(n, _)| n.clone()).collect();
439
440    let mut warnings = Vec::new();
441
442    if !has_template {
443        warnings.push("No template packet found".to_string());
444    } else if template_bytes < 100 {
445        warnings.push(format!(
446            "Template packet is empty (< 100 bytes) — only {template_bytes} bytes"
447        ));
448    }
449
450    if !has_datasets {
451        warnings.push("No datasets packet".to_string());
452    } else if datasets_bytes < 50 {
453        warnings.push(format!(
454            "Datasets packet is suspiciously small (< 50 bytes) — only {datasets_bytes} bytes"
455        ));
456    }
457
458    PacketValidation {
459        has_template,
460        has_datasets,
461        has_config,
462        template_bytes,
463        datasets_bytes,
464        packet_names,
465        warnings,
466    }
467}
468/// extract_embedded_fonts.
469// ─── Embedded font extraction ────────────────────────────────────────────────
470pub fn extract_embedded_fonts(pdf: &Pdf) -> Vec<(String, Vec<u8>)> {
471    use pdf_syntax::object::dict::keys::{FONT_FILE, FONT_FILE2, FONT_FILE3, FONT_NAME, TYPE};
472    use pdf_syntax::object::Name;
473    let mut fonts = Vec::new();
474    for obj in pdf.objects() {
475        let dict = match &obj {
476            Object::Dict(d) => d.clone(),
477            Object::Stream(s) => s.dict().clone(),
478            _ => continue,
479        };
480        if dict
481            .get::<Name>(TYPE)
482            .is_none_or(|n| n.as_ref() != b"FontDescriptor")
483        {
484            continue;
485        }
486        let name = dict
487            .get::<Name>(FONT_NAME)
488            .map(|n| std::string::String::from_utf8_lossy(n.as_ref()).to_string())
489            .unwrap_or_default();
490        for key in [FONT_FILE2, FONT_FILE, FONT_FILE3] {
491            if let Some(s) = dict.get::<Stream<'_>>(key) {
492                if let Ok(d) = s.decoded() {
493                    if !d.is_empty() {
494                        fonts.push((name.clone(), d));
495                        break;
496                    }
497                }
498            }
499        }
500    }
501    fonts
502}
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507    #[test]
508    fn parse_xfa_packets() {
509        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="f1"><field name="T1"/></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><f1><T1>Hi</T1></f1></xfa:data></xfa:datasets></xdp:xdp>"#;
510        let p = parse_xfa_xml(xml);
511        assert_eq!(p.packets.len(), 2);
512        assert!(p.template().is_some());
513        assert!(p.datasets().is_some());
514    }
515    #[test]
516    fn empty_xfa() {
517        let p = parse_xfa_xml(r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#);
518        assert_eq!(p.packets.len(), 0);
519    }
520
521    #[test]
522    fn get_packet_missing_returns_none() {
523        let p = parse_xfa_xml(r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#);
524        assert!(p.get_packet("template").is_none());
525        assert!(p.get_packet("nonexistent").is_none());
526        assert!(p.config().is_none());
527        assert!(p.locale_set().is_none());
528    }
529
530    #[test]
531    fn full_xml_preserved() {
532        // full_xml should always capture the entire input string.
533        let xml =
534            r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#;
535        let p = parse_xfa_xml(xml);
536        let stored = p.full_xml.as_deref().unwrap_or("");
537        assert!(stored.contains("xdp:xdp"));
538    }
539
540    #[test]
541    fn config_packet_parsed() {
542        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present></config></xdp:xdp>"#;
543        let p = parse_xfa_xml(xml);
544        assert_eq!(p.packets.len(), 1);
545        assert!(p.config().is_some());
546        assert!(p.template().is_none());
547    }
548
549    #[test]
550    fn multiple_packets_order_preserved() {
551        // template must come before datasets — order matches the XDP source order.
552        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
553        let p = parse_xfa_xml(xml);
554        assert_eq!(p.packets.len(), 2);
555        assert_eq!(p.packets[0].0, "template");
556        assert_eq!(p.packets[1].0, "datasets");
557        assert!(p.template().is_some());
558        assert!(p.datasets().is_some());
559    }
560
561    // ── PacketValidation tests (issue #1085) ──────────────────────────────
562
563    #[test]
564    fn validate_complete_packets_no_warnings() {
565        let xml = r#"<?xml version="1.0"?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="firstName" xmlns:ui="http://www.xfa.org/schema/xfa-template/3.3/"><ui><textEdit/></ui></field></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root><firstName>Alice</firstName></root></xfa:data></xfa:datasets></xdp:xdp>"#;
566        let p = parse_xfa_xml(xml);
567        let v = validate_xfa_packets(&p);
568        assert!(v.has_template);
569        assert!(v.has_datasets);
570        assert!(v.template_bytes > 0);
571        assert!(v.datasets_bytes > 0);
572        assert!(
573            v.warnings.is_empty(),
574            "expected no warnings, got: {:?}",
575            v.warnings
576        );
577    }
578
579    #[test]
580    fn validate_missing_template_produces_warning() {
581        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
582        let p = parse_xfa_xml(xml);
583        let v = validate_xfa_packets(&p);
584        assert!(!v.has_template);
585        assert!(v
586            .warnings
587            .iter()
588            .any(|w| w.contains("No template packet found")));
589    }
590
591    #[test]
592    fn validate_missing_datasets_produces_warning() {
593        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="x"/><field name="y"/><field name="z"/><field name="w"/></subform></template></xdp:xdp>"#;
594        let p = parse_xfa_xml(xml);
595        let v = validate_xfa_packets(&p);
596        assert!(!v.has_datasets);
597        assert!(v.warnings.iter().any(|w| w.contains("No datasets packet")));
598    }
599
600    #[test]
601    fn validate_tiny_template_produces_warning() {
602        let mut p = XfaPackets::default();
603        p.packets.push(("template".to_string(), "<t/>".to_string()));
604        p.packets.push((
605            "datasets".to_string(),
606            "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"><xfa:data/></xfa:datasets>".to_string(),
607        ));
608        let v = validate_xfa_packets(&p);
609        assert!(v.warnings.iter().any(|w| w.contains("< 100 bytes")));
610    }
611
612    #[test]
613    fn validate_tiny_datasets_produces_warning() {
614        let mut p = XfaPackets::default();
615        // Give a substantial template so that warning comes from datasets only.
616        p.packets.push((
617            "template".to_string(),
618            "<template xmlns=\"http://www.xfa.org/schema/xfa-template/3.3/\"><subform name=\"root\"><field name=\"a\"/><field name=\"b\"/><field name=\"c\"/></subform></template>".to_string(),
619        ));
620        p.packets
621            .push(("datasets".to_string(), "<ds/>".to_string()));
622        let v = validate_xfa_packets(&p);
623        assert!(v.warnings.iter().any(|w| w.contains("< 50 bytes")));
624    }
625
626    #[test]
627    fn validate_packet_names_list() {
628        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present/></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="f1"/><field name="f2"/><field name="f3"/></subform></template></xdp:xdp>"#;
629        let p = parse_xfa_xml(xml);
630        let v = validate_xfa_packets(&p);
631        assert!(v.packet_names.contains(&"config".to_string()));
632        assert!(v.packet_names.contains(&"template".to_string()));
633        assert!(v.has_config);
634    }
635
636    // ── XFA corpus tests (issue #1086) ────────────────────────────────────
637    // Ten synthetic tests covering representative XFA document patterns.
638    // Each test uses small in-memory XML strings — no real PDFs required.
639
640    /// 1. Static XFA form detection via baseProfile.
641    #[test]
642    fn corpus_01_static_xfa_form_detection() {
643        use crate::classify::{detect_xfa_type_from_packets, XfaType};
644        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/" baseProfile="interactiveForms"><subform name="Page1"><field name="LastName"/><field name="FirstName"/></subform></template></xdp:xdp>"#;
645        let p = parse_xfa_xml(xml);
646        assert_eq!(detect_xfa_type_from_packets(&p), XfaType::Static);
647    }
648
649    /// 2. Dynamic XFA form detection (no baseProfile constraint).
650    #[test]
651    fn corpus_02_dynamic_xfa_form_detection() {
652        use crate::classify::{detect_xfa_type_from_packets, XfaType};
653        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><occur min="0" max="-1"/><field name="item"/></subform></template></xdp:xdp>"#;
654        let p = parse_xfa_xml(xml);
655        assert_eq!(detect_xfa_type_from_packets(&p), XfaType::Dynamic);
656    }
657
658    /// 3. XFA with multiple packets (template + datasets + config).
659    #[test]
660    fn corpus_03_multiple_packets() {
661        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data/></xfa:datasets></xdp:xdp>"#;
662        let p = parse_xfa_xml(xml);
663        assert_eq!(p.packets.len(), 3, "should have config, template, datasets");
664        assert!(p.config().is_some());
665        assert!(p.template().is_some());
666        assert!(p.datasets().is_some());
667    }
668
669    /// 4. XFA with no datasets (template-only — blank form, no data bound).
670    #[test]
671    fn corpus_04_template_only_no_datasets() {
672        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="LastName"/><field name="FirstName"/><field name="DOB"/></subform></template></xdp:xdp>"#;
673        let p = parse_xfa_xml(xml);
674        assert!(p.template().is_some());
675        assert!(p.datasets().is_none());
676        let v = validate_xfa_packets(&p);
677        assert!(!v.has_datasets);
678        assert!(v.warnings.iter().any(|w| w.contains("No datasets")));
679    }
680
681    /// 5. XFA with binary-like image data embedded in datasets (base64 blob).
682    #[test]
683    fn corpus_05_xfa_with_image_data_in_datasets() {
684        // Simulate datasets containing a base64-encoded image field.
685        let b64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
686        let xml = format!(
687            r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="photo"><ui><imageEdit/></ui></field></subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root><photo contentType="image/png" href="">{b64_image}</photo></root></xfa:data></xfa:datasets></xdp:xdp>"#
688        );
689        let p = parse_xfa_xml(&xml);
690        assert!(p.template().is_some());
691        assert!(p.datasets().is_some());
692        let ds = p.datasets().unwrap();
693        assert!(ds.contains(b64_image), "datasets should contain image data");
694    }
695
696    /// 6. Non-XFA PDF (empty bytes) returns XfaType::None.
697    #[test]
698    fn corpus_06_non_xfa_pdf_returns_none() {
699        use crate::classify::{detect_xfa_type, XfaType};
700        // A plain PDF header with no AcroForm/XFA.
701        let not_xfa: &[u8] = b"%PDF-1.4\n%%EOF";
702        assert_eq!(detect_xfa_type(not_xfa), XfaType::None);
703    }
704
705    /// 7. XFA with config packet — config is correctly parsed and accessible.
706    #[test]
707    fn corpus_07_xfa_with_config_packet() {
708        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><config xmlns="http://www.xfa.org/schema/xci/3.1/"><present><xdp><packets>*</packets></xdp></present><pdf><version>1.6</version></pdf></config><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"/></template></xdp:xdp>"#;
709        let p = parse_xfa_xml(xml);
710        assert!(p.config().is_some());
711        let cfg = p.config().unwrap();
712        assert!(cfg.contains("packets"));
713        let v = validate_xfa_packets(&p);
714        assert!(v.has_config);
715    }
716
717    /// 8. Empty datasets packet (incremental save pattern — original blank form).
718    #[test]
719    fn corpus_08_empty_datasets_incremental_save_pattern() {
720        // Two datasets entries: original blank (small) and filled (larger).
721        let mut p = XfaPackets::default();
722        p.packets.push((
723            "template".to_string(),
724            "<template xmlns=\"http://www.xfa.org/schema/xfa-template/3.3/\"><subform name=\"root\"><field name=\"qty\"/><field name=\"price\"/><field name=\"total\"/></subform></template>".to_string(),
725        ));
726        // Blank (incremental save artefact — very small):
727        p.packets.push((
728            "datasets".to_string(),
729            "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"/>".to_string(),
730        ));
731        // Filled (the real data):
732        p.packets.push(("datasets".to_string(), "<xfa:datasets xmlns:xfa=\"http://www.xfa.org/schema/xfa-data/1.0/\"><xfa:data><root><qty>3</qty><price>9.99</price><total>29.97</total></root></xfa:data></xfa:datasets>".to_string()));
733        // datasets() must return the LARGEST entry.
734        let ds = p.datasets().expect("datasets should exist");
735        assert!(
736            ds.contains("29.97"),
737            "should return the larger/filled datasets"
738        );
739    }
740
741    /// 9. Large template with many fields — validation should have no warnings.
742    #[test]
743    fn corpus_09_large_template_many_fields() {
744        // Build a template with 20 fields to ensure validation handles size correctly.
745        let fields: String = (1..=20)
746            .map(|i| format!("<field name=\"field{i}\"><ui><textEdit/></ui></field>"))
747            .collect();
748        let xml = format!(
749            r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root">{fields}</subform></template><xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"><xfa:data><root>{}</root></xfa:data></xfa:datasets></xdp:xdp>"#,
750            (1..=20)
751                .map(|i| format!("<field{i}>val{i}</field{i}>"))
752                .collect::<String>()
753        );
754        let p = parse_xfa_xml(&xml);
755        let v = validate_xfa_packets(&p);
756        assert!(v.has_template);
757        assert!(v.has_datasets);
758        assert!(
759            v.template_bytes >= 100,
760            "large template should exceed 100 bytes"
761        );
762        assert!(
763            v.warnings.is_empty(),
764            "no warnings expected: {:?}",
765            v.warnings
766        );
767    }
768
769    /// 10. XFA with localeSet packet — localeSet is correctly accessible.
770    #[test]
771    fn corpus_10_xfa_with_locale_set_packet() {
772        let xml = r#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><localeSet xmlns="http://www.xfa.org/schema/xfa-locale-set/2.7/"><locale name="en_US" desc="English (United States)"><calendarSymbols name="gregorian"/></locale></localeSet><template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="root"><field name="date"/></subform></template></xdp:xdp>"#;
773        let p = parse_xfa_xml(xml);
774        assert!(
775            p.locale_set().is_some(),
776            "localeSet packet should be accessible"
777        );
778        assert!(p.template().is_some());
779        let ls = p.locale_set().unwrap();
780        assert!(ls.contains("en_US"));
781    }
782}