1use lopdf::{dictionary, Dictionary, Document, Object, ObjectId, Stream, StringFormat};
54use std::cell::Cell;
55use std::collections::{HashMap, HashSet};
56use std::fmt::Write as FmtWrite;
57#[cfg(not(target_arch = "wasm32"))]
58use std::thread;
59#[cfg(not(target_arch = "wasm32"))]
60use std::time::Duration;
61
thread_local! {
    // Per-thread re-entrancy counter for `flatten_xfa_to_pdf_internal`: that
    // function returns an error while the counter is non-zero, increments it on
    // entry, and decrements it again through a drop guard when it returns.
    static FLATTEN_DEPTH: Cell<u32> = const { Cell::new(0) };
}
79
80#[cfg(feature = "xfa-js-sandboxed")]
81use crate::dynamic::apply_dynamic_scripts_with_runtime;
82use crate::dynamic::{
83 apply_dynamic_scripts, apply_dynamic_scripts_with_mode, runtime_diag_enabled,
84 DynamicScriptOutcome, FormDomMatchEntry, JsExecutionMode, OutputQuality,
85};
86use crate::error::{Result, XfaError};
87use crate::extract::extract_xfa_from_bytes;
88use crate::flatten_trace;
89use crate::font_bridge::{
90 font_variant_key, pdf_glyph_name_to_unicode, CidFontInfo, EmbeddedFontData, PdfBaseEncoding,
91 PdfSimpleEncoding, PdfSourceFont, ResolvedFont, XfaFontResolver, XfaFontSpec,
92};
93use crate::image_bridge::embed_image;
94use crate::javascript_policy::{self, JavaScriptEntryPoint};
95use crate::merger::FormMerger;
96use crate::render_bridge::{
97 generate_all_overlays, generate_field_values_overlays, unicode_to_winansi, FontMetricsData,
98 PageOverlay, XfaRenderConfig,
99};
100use xfa_dom_resolver::data_dom::DataDom;
101use xfa_layout_engine::form::{DrawContent, FormNodeId, FormNodeStyle, FormTree};
102use xfa_layout_engine::layout::{
103 LayoutContent, LayoutDom, LayoutEngine, LayoutNode, LayoutProfile,
104};
105
/// Phases of the flatten pipeline, declared in execution order.
///
/// The derived ordering follows declaration order, and the implicit
/// discriminants count up from zero (`Extract` = 0 … `Cleanup` = 6), so a later
/// stage always compares greater than an earlier one.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum PipelineStage {
    Extract,
    Bind,
    Layout,
    Render,
    Embed,
    Write,
    Cleanup,
}
130
131fn create_minimal_pdf_document() -> Document {
132 let mut doc = Document::new();
133 let pages_id = doc.add_object(Object::Dictionary(dictionary! {
134 "Type" => Object::Name(b"Pages".to_vec()),
135 "Kids" => Object::Array(vec![]),
136 "Count" => Object::Integer(0)
137 }));
138 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
139 "Type" => Object::Name(b"Catalog".to_vec()),
140 "Pages" => Object::Reference(pages_id)
141 }));
142 doc.trailer.set("Root", Object::Reference(catalog_id));
143 doc
144}
145
/// Layout diagnostics returned next to the flattened PDF by the
/// `*_with_layout_dump*` entry points.
#[derive(Debug, Clone, Default)]
pub struct LayoutDump {
    /// One entry per page in the dump.
    pub pages: Vec<LayoutDumpEntry>,
    /// Script outcome for the run; `FlattenOutput::new` overwrites this with
    /// the outcome it is given.
    pub dynamic_scripts: DynamicScriptOutcome,
    /// Copy of `dynamic_scripts.output_quality`, also set by `FlattenOutput::new`.
    pub output_quality: OutputQuality,
}
156
/// Per-page record inside a [`LayoutDump`].
///
/// NOTE(review): the code that fills these fields is not in this part of the
/// file, so the field notes below are read from the names only — confirm
/// units and numbering base against the producer.
#[derive(Debug, Clone)]
pub struct LayoutDumpEntry {
    /// Page number of this entry (numbering base not visible here — TODO confirm).
    pub page_num: u32,
    /// Height of the page (presumably in the layout engine's units).
    pub page_height: f64,
    /// Height actually occupied by laid-out content (presumably same units).
    pub used_height: f64,
    /// Whether content from this page overflowed onto the following page.
    pub overflow_to_next: bool,
    /// Identifier of the first element that overflowed, when there is one.
    pub first_overflow_element: Option<String>,
}
171
/// Selects which rendering strategy the flatten pipeline is asked to use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum XfaRenderingPolicy {
    /// Default policy.
    #[default]
    SavedStateFaithful,
    /// Experimental fresh-merge policy.
    FreshMergeExperimental,
}

impl XfaRenderingPolicy {
    /// Returns the canonical snake_case token for this policy.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::SavedStateFaithful => "saved_state_faithful",
            Self::FreshMergeExperimental => "fresh_merge_experimental",
        }
    }

    /// Parses a policy token.
    ///
    /// Surrounding whitespace is ignored and matching is ASCII
    /// case-insensitive. Returns `None` for any token that is not one of the
    /// accepted aliases.
    #[must_use]
    pub fn from_token(token: &str) -> Option<Self> {
        const SAVED_STATE_ALIASES: [&str; 4] = [
            "saved-state",
            "saved_state",
            "saved_state_faithful",
            "savedstatefaithful",
        ];
        const FRESH_MERGE_ALIASES: [&str; 4] = [
            "fresh-merge",
            "fresh_merge",
            "fresh_merge_experimental",
            "freshmergeexperimental",
        ];

        let candidate = token.trim();
        let is_one_of =
            |aliases: &[&str]| aliases.iter().any(|alias| candidate.eq_ignore_ascii_case(alias));

        if is_one_of(&SAVED_STATE_ALIASES) {
            Some(Self::SavedStateFaithful)
        } else if is_one_of(&FRESH_MERGE_ALIASES) {
            Some(Self::FreshMergeExperimental)
        } else {
            None
        }
    }

    /// Reports whether this policy is supported; every current variant is.
    #[must_use]
    pub const fn is_supported(self) -> bool {
        match self {
            Self::SavedStateFaithful | Self::FreshMergeExperimental => true,
        }
    }
}
241
242#[derive(Debug, Clone, PartialEq, Eq, Default)]
244pub struct FlattenMetadata {
245 pub dynamic_scripts: DynamicScriptOutcome,
247 pub output_quality: OutputQuality,
249 pub rendering_policy: XfaRenderingPolicy,
251 pub fresh_merge_admitted_nodes: usize,
263}
264
265impl FlattenMetadata {
266 fn from_dynamic_scripts(dynamic_scripts: DynamicScriptOutcome) -> Self {
267 let output_quality = dynamic_scripts.output_quality;
268 Self {
269 dynamic_scripts,
270 output_quality,
271 rendering_policy: XfaRenderingPolicy::SavedStateFaithful,
272 fresh_merge_admitted_nodes: 0,
273 }
274 }
275}
276
277struct FlattenOutput {
278 pdf_bytes: Vec<u8>,
279 layout_dump: LayoutDump,
280 metadata: FlattenMetadata,
281}
282
283impl FlattenOutput {
284 fn new(
285 pdf_bytes: Vec<u8>,
286 mut layout_dump: LayoutDump,
287 dynamic_scripts: DynamicScriptOutcome,
288 ) -> Self {
289 let output_quality = dynamic_scripts.output_quality;
290 let metadata = FlattenMetadata::from_dynamic_scripts(dynamic_scripts.clone());
291 layout_dump.dynamic_scripts = dynamic_scripts;
292 layout_dump.output_quality = output_quality;
293 Self {
294 pdf_bytes,
295 layout_dump,
296 metadata,
297 }
298 }
299
300 fn without_dump(pdf_bytes: Vec<u8>) -> Self {
301 Self::new(
302 pdf_bytes,
303 LayoutDump::default(),
304 DynamicScriptOutcome::default(),
305 )
306 }
307}
308
309pub fn is_pdf_encrypted(pdf_bytes: &[u8]) -> bool {
311 Document::load_mem(pdf_bytes)
312 .map(|doc| doc.trailer.get(b"Encrypt").is_ok())
313 .unwrap_or(false)
314}
315
/// Outcome of `try_decrypt_pdf`.
enum DecryptResult {
    /// No encryption was detected (also returned when the bytes fail to parse).
    NotEncrypted,
    /// The document was encrypted; holds the re-serialised bytes.
    Decrypted(Vec<u8>),
    /// Loading with an empty password, or re-saving, failed.
    NeedsPassword,
}
321
322fn try_decrypt_pdf(pdf_bytes: &[u8]) -> DecryptResult {
325 let mut doc = match Document::load_mem(pdf_bytes) {
326 Ok(d) => d,
327 Err(_) => return DecryptResult::NotEncrypted, };
329
330 if doc.was_encrypted() {
334 let mut buf = Vec::new();
336 match doc.save_to(&mut buf) {
337 Ok(()) => return DecryptResult::Decrypted(buf),
338 Err(_) => return DecryptResult::NeedsPassword,
339 }
340 }
341
342 if doc.trailer.get(b"Encrypt").is_ok() {
343 match Document::load_mem_with_password(pdf_bytes, "") {
345 Ok(mut decrypted_doc) => {
346 decrypted_doc.trailer.remove(b"Encrypt");
347 let mut buf = Vec::new();
348 match decrypted_doc.save_to(&mut buf) {
349 Ok(()) => return DecryptResult::Decrypted(buf),
350 Err(_) => return DecryptResult::NeedsPassword,
351 }
352 }
353 Err(_) => return DecryptResult::NeedsPassword,
354 }
355 }
356
357 DecryptResult::NotEncrypted
358}
359
360fn page_has_fields(nodes: &[LayoutNode], tree: &FormTree) -> bool {
374 use xfa_layout_engine::form::{FieldKind, FormNodeType};
375 nodes.iter().any(|n| {
376 let is_data_field = matches!(tree.get(n.form_node).node_type, FormNodeType::Field { .. })
379 && !matches!(
380 tree.meta(n.form_node).field_kind,
381 FieldKind::Signature | FieldKind::Button | FieldKind::Barcode
382 );
383 is_data_field || page_has_fields(&n.children, tree)
384 })
385}
386
387fn page_has_field_data(nodes: &[LayoutNode], tree: &FormTree) -> bool {
392 use xfa_layout_engine::form::FormNodeType;
393 nodes.iter().any(|n| {
394 matches!(
395 &tree.get(n.form_node).node_type,
396 FormNodeType::Field { value } if !value.is_empty()
397 ) || page_has_field_data(&n.children, tree)
398 })
399}
400
401fn page_field_counts(nodes: &[LayoutNode], tree: &FormTree) -> (usize, usize, usize) {
403 use xfa_layout_engine::form::FormNodeType;
404 let mut total = 0;
405 let mut empty = 0;
406 let mut nonempty = 0;
407 for n in nodes {
408 if let FormNodeType::Field { value } = &tree.get(n.form_node).node_type {
409 total += 1;
410 if value.trim().is_empty() {
411 empty += 1;
412 } else {
413 nonempty += 1;
414 }
415 }
416 let (t, e, ne) = page_field_counts(&n.children, tree);
417 total += t;
418 empty += e;
419 nonempty += ne;
420 }
421 (total, empty, nonempty)
422}
423
424fn page_static_draw_chars(nodes: &[LayoutNode]) -> usize {
427 let mut total = 0usize;
428 for n in nodes {
429 match &n.content {
430 LayoutContent::Text(t) => total += t.chars().count(),
431 LayoutContent::Draw(DrawContent::Text(t)) => total += t.chars().count(),
432 LayoutContent::WrappedText {
433 lines, from_field, ..
434 } if !*from_field => {
435 total += lines.iter().map(|l| l.chars().count()).sum::<usize>();
436 }
437 _ => {}
438 }
439 total += page_static_draw_chars(&n.children);
440 }
441 total
442}
443
444fn page_has_visible_content(nodes: &[LayoutNode]) -> bool {
455 nodes.iter().any(|n| {
456 let self_visible = match &n.content {
457 LayoutContent::None => false,
458 LayoutContent::Text(t) => !t.trim().is_empty(),
459 LayoutContent::WrappedText { lines, .. } => lines.iter().any(|l| !l.trim().is_empty()),
460 LayoutContent::Draw(DrawContent::Text(t)) => !t.trim().is_empty(),
461 LayoutContent::Draw(_) => true,
463 LayoutContent::Field { .. } => true,
466 LayoutContent::Image { .. } => true,
467 };
468 self_visible || page_has_visible_content(&n.children)
469 })
470}
471
/// Reads the `XFA_SUPPRESSION_TRUST_LAYOUT` environment variable as a boolean
/// flag.
///
/// The flag is on when the trimmed value is `1`, `on`, or `true` (ASCII
/// case-insensitive); unset, unreadable, or any other value means off.
fn suppression_trust_layout_enabled() -> bool {
    const TRUTHY: [&str; 3] = ["1", "on", "true"];
    match std::env::var("XFA_SUPPRESSION_TRUST_LAYOUT") {
        Ok(raw) => {
            let flag = raw.trim();
            TRUTHY.iter().any(|accepted| flag.eq_ignore_ascii_case(accepted))
        }
        Err(_) => false,
    }
}
488
/// Reads the `XFA_JS_HARVEST_MODE` environment variable as a boolean flag.
///
/// The flag is on when the trimmed value is `1`, `on`, or `true` (ASCII
/// case-insensitive); unset, unreadable, or any other value means off.
fn harvest_mode_enabled() -> bool {
    std::env::var("XFA_JS_HARVEST_MODE")
        .map(|raw| {
            let flag = raw.trim();
            ["1", "on", "true"]
                .iter()
                .any(|accepted| flag.eq_ignore_ascii_case(accepted))
        })
        .unwrap_or(false)
}
504
505fn snapshot_nonempty_field_ids(tree: &FormTree) -> HashSet<FormNodeId> {
510 use xfa_layout_engine::form::FormNodeType;
511 let mut ids = HashSet::new();
512 for i in 0..tree.nodes.len() {
513 let id = FormNodeId(i);
514 if let FormNodeType::Field { value } = &tree.get(id).node_type {
515 if !value.trim().is_empty() {
516 ids.insert(id);
517 }
518 }
519 }
520 ids
521}
522
523fn page_has_field_data_snapshot(nodes: &[LayoutNode], snapshot: &HashSet<FormNodeId>) -> bool {
526 nodes.iter().any(|n| {
527 snapshot.contains(&n.form_node) || page_has_field_data_snapshot(&n.children, snapshot)
528 })
529}
530
531fn page_form_node_signature(nodes: &[LayoutNode], out: &mut Vec<usize>) {
535 for n in nodes {
536 out.push(n.form_node.0);
537 page_form_node_signature(&n.children, out);
538 }
539}
540
541fn build_parent_map(tree: &FormTree) -> Vec<usize> {
546 let mut parent = vec![usize::MAX; tree.nodes.len()];
547 for (pid, node) in tree.nodes.iter().enumerate() {
548 for &child in &node.children {
549 if child.0 < parent.len() {
550 parent[child.0] = pid;
551 }
552 }
553 }
554 parent
555}
556
557fn page_data_bound_count(nodes: &[LayoutNode], tree: &FormTree) -> usize {
559 let mut c = 0;
560 for n in nodes {
561 if tree.meta(n.form_node).bound_data_node.is_some() {
562 c += 1;
563 }
564 c += page_data_bound_count(&n.children, tree);
565 }
566 c
567}
568
569fn page_repeating_ancestor(
572 distinct_ids: &[usize],
573 tree: &FormTree,
574 parent_map: &[usize],
575) -> Option<usize> {
576 use xfa_layout_engine::form::FormNodeId;
577 for &start in distinct_ids {
578 let mut cur = start;
579 let mut depth = 0;
580 while cur != usize::MAX && depth < 4096 {
581 if cur < tree.nodes.len() && tree.get(FormNodeId(cur)).occur.is_repeating() {
582 return Some(cur);
583 }
584 cur = parent_map.get(cur).copied().unwrap_or(usize::MAX);
585 depth += 1;
586 }
587 }
588 None
589}
590
591fn compute_suppression_diags(
592 layout: &LayoutDom,
593 tree: &FormTree,
594 pre_js_nonempty: Option<&HashSet<FormNodeId>>,
595) -> Vec<flatten_trace::PageSuppressionDiag> {
596 let parent_map = build_parent_map(tree);
597 let n = layout.pages.len();
598 let trust_layout = suppression_trust_layout_enabled();
599 let raw: Vec<(bool, bool, bool)> = layout
603 .pages
604 .iter()
605 .map(|p| {
606 let hd = match pre_js_nonempty {
607 Some(snap) => page_has_field_data_snapshot(&p.nodes, snap),
608 None => page_has_field_data(&p.nodes, tree),
609 };
610 (p.runtime_instantiated, page_has_fields(&p.nodes, tree), hd)
611 })
612 .collect();
613 let raw_keep = |i: usize| -> bool {
614 let (rt, hf, hd) = raw[i];
618 rt || hd || !hf
619 };
620 let any_keep = (0..n).any(raw_keep);
621
622 let mut sigs: Vec<Vec<usize>> = Vec::with_capacity(n);
624 for p in &layout.pages {
625 let mut s = Vec::new();
626 page_form_node_signature(&p.nodes, &mut s);
627 s.sort_unstable();
628 s.dedup();
629 sigs.push(s);
630 }
631
632 let mut diags = Vec::with_capacity(n);
633 for i in 0..n {
634 let (rt, hf, hd) = raw[i];
635 let (fc, ef, nf) = page_field_counts(&layout.pages[i].nodes, tree);
636 let static_chars = page_static_draw_chars(&layout.pages[i].nodes);
637 let dup = (0..i)
638 .find(|&j| sigs[j] == sigs[i])
639 .map_or(-1, |j| j as i64);
640 let (keep, reason) = if n <= 1 {
641 (true, "single_page")
642 } else if rt {
643 (true, "runtime_instantiated")
644 } else if hf && hd {
645 (true, "has_field_data")
646 } else if !hf {
647 (true, "no_fields_static_kept")
648 } else if any_keep {
649 if trust_layout && page_has_visible_content(&layout.pages[i].nodes) {
652 (true, "trust_layout_kept")
653 } else {
654 (false, "data_empty_dropped")
655 }
656 } else {
657 (true, "all_empty_kept")
658 };
659
660 let data_bound = page_data_bound_count(&layout.pages[i].nodes, tree);
662 let repeating_ancestor = page_repeating_ancestor(&sigs[i], tree, &parent_map);
663 let under_repeating = repeating_ancestor.is_some();
664 let occur_template_id = repeating_ancestor.map_or(-1, |id| id as i64);
665 let has_data = nf > 0 || data_bound > 0;
666 let page_reason = if rt {
667 "root_page"
668 } else if under_repeating && !has_data {
669 "repeated_empty_instance"
670 } else if under_repeating {
671 "occur_instance"
672 } else if has_data {
673 "continuation"
674 } else if static_chars > 0 {
675 "static_page_area"
676 } else {
677 "unknown"
678 };
679 let suppression_safe_to_drop = page_reason == "repeated_empty_instance";
680 let provenance_confidence = if rt || under_repeating || has_data {
681 "exact"
682 } else if static_chars > 0 {
683 "inferred"
684 } else {
685 "unknown"
686 };
687
688 diags.push(flatten_trace::PageSuppressionDiag {
689 page_index: i,
690 keep,
691 reason,
692 field_count: fc,
693 empty_field_count: ef,
694 nonempty_field_count: nf,
695 static_draw_text_chars: static_chars,
696 distinct_form_nodes: sigs[i].len(),
697 duplicate_of_page: dup,
698 runtime_instantiated: rt,
699 under_repeating_subform: under_repeating,
700 occur_template_id,
701 data_bound_nodes_count: data_bound,
702 page_reason,
703 suppression_safe_to_drop,
704 provenance_confidence,
705 });
706 }
707 diags
708}
709
710#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
753pub fn flatten_xfa_to_pdf(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
754 flatten_xfa_to_pdf_internal(pdf_bytes, false, XfaRenderingPolicy::SavedStateFaithful)
755 .map(|out| out.pdf_bytes)
756}
757#[must_use = "flattened PDF bytes and layout dump must be used; discarding them loses output"]
766pub fn flatten_xfa_to_pdf_with_layout_dump(pdf_bytes: &[u8]) -> Result<(Vec<u8>, LayoutDump)> {
767 let out = flatten_xfa_to_pdf_internal(pdf_bytes, true, XfaRenderingPolicy::SavedStateFaithful)?;
768 Ok((out.pdf_bytes, out.layout_dump))
769}
770
771#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
779pub fn flatten_xfa_to_pdf_with_metadata(pdf_bytes: &[u8]) -> Result<(Vec<u8>, FlattenMetadata)> {
780 let out =
781 flatten_xfa_to_pdf_internal(pdf_bytes, false, XfaRenderingPolicy::SavedStateFaithful)?;
782 Ok((out.pdf_bytes, out.metadata))
783}
784
785#[must_use = "flattened PDF bytes, layout dump, and metadata must be used; discarding them loses output"]
794pub fn flatten_xfa_to_pdf_with_layout_dump_and_metadata(
795 pdf_bytes: &[u8],
796) -> Result<(Vec<u8>, LayoutDump, FlattenMetadata)> {
797 let out = flatten_xfa_to_pdf_internal(pdf_bytes, true, XfaRenderingPolicy::SavedStateFaithful)?;
798 Ok((out.pdf_bytes, out.layout_dump, out.metadata))
799}
800
801#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
812pub fn flatten_xfa_to_pdf_with_policy(
813 pdf_bytes: &[u8],
814 policy: XfaRenderingPolicy,
815) -> Result<Vec<u8>> {
816 flatten_xfa_to_pdf_with_policy_and_metadata(pdf_bytes, policy).map(|(bytes, _)| bytes)
817}
818
819#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
828pub fn flatten_xfa_to_pdf_with_policy_and_metadata(
829 pdf_bytes: &[u8],
830 policy: XfaRenderingPolicy,
831) -> Result<(Vec<u8>, FlattenMetadata)> {
832 let out = flatten_xfa_to_pdf_internal(pdf_bytes, false, policy)?;
834 let mut metadata = out.metadata;
835 metadata.rendering_policy = policy;
836 Ok((out.pdf_bytes, metadata))
837}
838
839fn flatten_xfa_to_pdf_internal(
840 pdf_bytes: &[u8],
841 collect_layout_dump: bool,
842 policy: XfaRenderingPolicy,
843) -> Result<FlattenOutput> {
844 let depth = FLATTEN_DEPTH.with(|d| d.get());
853 if depth >= 1 {
854 return Err(XfaError::LayoutFailed(
855 "flatten_xfa_to_pdf called recursively — aborting to prevent stack overflow".into(),
856 ));
857 }
858 FLATTEN_DEPTH.with(|d| d.set(depth + 1));
859 struct DepthGuard;
861 impl Drop for DepthGuard {
862 fn drop(&mut self) {
863 FLATTEN_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
864 }
865 }
866 let _depth_guard = DepthGuard;
867
868 if !pdf_bytes.windows(9).any(|w| w == b"/AcroForm")
872 && !pdf_bytes.windows(7).any(|w| w == b"xdp:xdp")
873 {
874 return Ok(FlattenOutput::without_dump(pdf_bytes.to_vec()));
875 }
876
877 let decrypted;
880 let pdf_bytes = match try_decrypt_pdf(pdf_bytes) {
881 DecryptResult::NotEncrypted => pdf_bytes,
882 DecryptResult::Decrypted(bytes) => {
883 decrypted = bytes;
884 &decrypted
885 }
886 DecryptResult::NeedsPassword => {
887 return Err(XfaError::Encrypted(
888 "PDF is encrypted and requires a password".into(),
889 ));
890 }
891 };
892
893 let packets = match extract_xfa_from_bytes(pdf_bytes.to_vec()) {
895 Ok(p) => p,
896 Err(_) => {
897 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
901 }
902 };
903
904 let template_xml = match packets.template() {
905 Some(t) => strip_undefined_xml_entities(t),
906 None => {
907 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
910 }
911 };
912
913 if is_corrupt_xfa_template(pdf_bytes.len(), &template_xml) {
917 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
918 }
919
920 let datasets_xml_owned = packets.datasets().map(strip_undefined_xml_entities);
928 let form_xml_owned = packets.get_packet("form").map(|s| s.to_string());
929
930 #[cfg(not(target_arch = "wasm32"))]
931 {
932 const FLATTEN_TIMEOUT: Duration = Duration::from_secs(30);
936 let pdf_bytes_ref = pdf_bytes.to_vec();
937 let template_xml_owned = template_xml.clone();
938
939 let handle = thread::spawn(move || {
940 xfa_flatten_inner(
941 &pdf_bytes_ref,
942 &template_xml_owned,
943 datasets_xml_owned.as_deref(),
944 form_xml_owned.as_deref(),
945 collect_layout_dump,
946 policy,
947 )
948 });
949
950 match handle.join() {
951 Ok(Ok(out)) => Ok(out),
952 Ok(Err(e @ XfaError::UnsupportedFeature(_))) => Err(e),
953 Ok(Err(e)) => {
954 eprintln!("XFA flatten failed: {e:?}");
955 static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
956 }
957 Err(_) => {
958 eprintln!("XFA flatten timed out after {:?}", FLATTEN_TIMEOUT);
959 static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
960 }
961 }
962 }
963
964 #[cfg(target_arch = "wasm32")]
968 {
969 match xfa_flatten_inner(
970 pdf_bytes,
971 &template_xml,
972 datasets_xml_owned.as_deref(),
973 form_xml_owned.as_deref(),
974 collect_layout_dump,
975 policy,
976 ) {
977 Ok(out) => Ok(out),
978 Err(e @ XfaError::UnsupportedFeature(_)) => Err(e),
979 Err(e) => {
980 eprintln!("XFA flatten failed: {e:?}");
981 static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
982 }
983 }
984 }
985}
986
/// Collects the non-empty `name` attributes of every `subform`, `subformSet`,
/// `exclGroup`, and `area` element in `template_xml`.
///
/// Returns an empty set when the XML fails to parse.
#[cfg(feature = "xfa-js-sandboxed")]
fn collect_declared_container_names(template_xml: &str) -> std::collections::HashSet<String> {
    let doc = match roxmltree::Document::parse(template_xml) {
        Ok(parsed) => parsed,
        Err(_) => return std::collections::HashSet::new(),
    };

    doc.descendants()
        .filter(|node| {
            node.is_element()
                && matches!(
                    node.tag_name().name(),
                    "subform" | "subformSet" | "exclGroup" | "area"
                )
        })
        .filter_map(|node| node.attribute("name"))
        .filter(|name| !name.is_empty())
        .map(|name| name.to_string())
        .collect()
}
1014
1015fn xfa_flatten_inner(
1017 pdf_bytes: &[u8],
1018 template_xml: &str,
1019 datasets_xml: Option<&str>,
1020 form_xml: Option<&str>,
1021 collect_layout_dump: bool,
1022 policy: XfaRenderingPolicy,
1023) -> Result<FlattenOutput> {
1024 let mut _stage = PipelineStage::Extract;
1027
1028 log::debug!(
1030 "XFA flatten: {} bytes input, template={} bytes",
1031 pdf_bytes.len(),
1032 template_xml.len()
1033 );
1034
1035 let data_dom = if let Some(ds_xml) = datasets_xml {
1036 DataDom::from_xml(ds_xml)
1037 .map_err(|e| XfaError::ParseFailed(format!("datasets parse: {e}")))?
1038 } else {
1039 DataDom::new()
1040 };
1041
1042 let image_files = match Document::load_mem(pdf_bytes) {
1045 Ok(doc) => extract_embedded_images(&doc),
1046 Err(_) => HashMap::new(),
1047 };
1048
1049 if template_xml.contains("barcode") {
1053 log::warn!("XFA barcode elements found but not supported — rendered as empty boxes");
1054 }
1055 if template_xml.contains("<signature") || template_xml.contains("<Signature") {
1056 log::warn!("XFA signature elements found but not supported — elements skipped");
1057 }
1058 if javascript_policy::template_mentions_javascript(template_xml) {
1059 log::warn!(
1060 "{}",
1061 javascript_policy::execution_denied_message(JavaScriptEntryPoint::XfaEventHook)
1062 );
1063 }
1064
1065 debug_assert!(
1067 _stage <= PipelineStage::Bind,
1068 "pipeline stage order violated: expected <= Bind"
1069 );
1070 _stage = PipelineStage::Bind;
1071
1072 let trace_image_files = image_files.len();
1074 let merger = FormMerger::new(&data_dom).with_image_files(image_files);
1075 let (mut tree, root_id) = merger
1076 .merge(template_xml)
1077 .map_err(|e| XfaError::ParseFailed(format!("template merge: {e}")))?;
1078
1079 log::debug!("XFA bind: {} form nodes created", tree.nodes.len());
1080
1081 let pre_js_nonempty_fields: Option<HashSet<FormNodeId>> = if harvest_mode_enabled() {
1097 Some(snapshot_nonempty_field_ids(&tree))
1098 } else {
1099 None
1100 };
1101 let dynamic_scripts = match std::env::var("XFA_JS_EXECUTION_MODE")
1102 .ok()
1103 .map(|s| s.to_ascii_lowercase())
1104 .as_deref()
1105 {
1106 Some("strict") => {
1107 apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::Strict)?
1108 }
1109 Some("sandboxed") | Some("sandboxed_runtime") => {
1110 #[cfg(feature = "xfa-js-sandboxed")]
1114 {
1115 use crate::js_runtime::{NullRuntime, QuickJsRuntime, XfaJsRuntime};
1116 match QuickJsRuntime::new() {
1117 Ok(mut rt) => {
1118 rt.set_data_handle(&data_dom as *const _);
1119 rt.set_declared_subform_names(collect_declared_container_names(
1125 template_xml,
1126 ));
1127 apply_dynamic_scripts_with_runtime(
1128 &mut tree,
1129 root_id,
1130 JsExecutionMode::SandboxedRuntime,
1131 &mut rt,
1132 )?
1133 }
1134 Err(_) => apply_dynamic_scripts_with_runtime(
1135 &mut tree,
1136 root_id,
1137 JsExecutionMode::SandboxedRuntime,
1138 &mut NullRuntime::new(),
1139 )?,
1140 }
1141 }
1142 #[cfg(not(feature = "xfa-js-sandboxed"))]
1143 apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::SandboxedRuntime)?
1144 }
1145 _ => apply_dynamic_scripts(&mut tree, root_id)?,
1146 };
1147 if dynamic_scripts.output_quality != OutputQuality::Exact {
1148 log::warn!(
1152 "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={} js_unsupported_host_calls={} js_probe_skips={}",
1153 dynamic_scripts.output_quality.as_str(),
1154 dynamic_scripts.js_present,
1155 dynamic_scripts.js_skipped,
1156 dynamic_scripts.other_skipped,
1157 dynamic_scripts.formcalc_run,
1158 dynamic_scripts.formcalc_errors,
1159 dynamic_scripts.js_executed,
1160 dynamic_scripts.js_runtime_errors,
1161 dynamic_scripts.js_timeouts,
1162 dynamic_scripts.js_oom,
1163 dynamic_scripts.js_host_calls,
1164 dynamic_scripts.js_mutations,
1165 dynamic_scripts.js_instance_writes,
1166 dynamic_scripts.js_list_writes,
1167 dynamic_scripts.js_binding_errors,
1168 dynamic_scripts.js_resolve_failures,
1169 dynamic_scripts.js_data_reads,
1170 dynamic_scripts.js_unsupported_host_calls,
1171 dynamic_scripts.js_probe_skips,
1172 );
1173 eprintln!(
1174 "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={} js_unsupported_host_calls={} js_probe_skips={}",
1175 dynamic_scripts.output_quality.as_str(),
1176 dynamic_scripts.js_present,
1177 dynamic_scripts.js_skipped,
1178 dynamic_scripts.other_skipped,
1179 dynamic_scripts.formcalc_run,
1180 dynamic_scripts.formcalc_errors,
1181 dynamic_scripts.js_executed,
1182 dynamic_scripts.js_runtime_errors,
1183 dynamic_scripts.js_timeouts,
1184 dynamic_scripts.js_oom,
1185 dynamic_scripts.js_host_calls,
1186 dynamic_scripts.js_mutations,
1187 dynamic_scripts.js_instance_writes,
1188 dynamic_scripts.js_list_writes,
1189 dynamic_scripts.js_binding_errors,
1190 dynamic_scripts.js_resolve_failures,
1191 dynamic_scripts.js_data_reads,
1192 dynamic_scripts.js_unsupported_host_calls,
1193 dynamic_scripts.js_probe_skips,
1194 );
1195 }
1196
1197 let admit_databound_override = std::env::var("XFA_FORMDOM_ADMIT_DATABOUND")
1210 .map(|v| {
1211 let v = v.trim();
1212 !(v.is_empty()
1213 || v == "0"
1214 || v.eq_ignore_ascii_case("off")
1215 || v.eq_ignore_ascii_case("false"))
1216 })
1217 .unwrap_or(true);
1218 let (fresh_merge_admitted, form_dom_match_failures, form_dom_match_log) =
1219 if let Some(fxml) = form_xml {
1220 apply_form_dom_presence(&mut tree, root_id, fxml, policy, admit_databound_override)
1221 } else {
1222 (0, 0, Vec::new())
1223 };
1224
1225 let resolved_fonts = resolve_template_fonts(template_xml, pdf_bytes);
1228 inject_resolved_metrics(&mut tree, &resolved_fonts);
1229
1230 debug_assert!(
1232 _stage <= PipelineStage::Layout,
1233 "pipeline stage order violated: expected <= Layout"
1234 );
1235 _stage = PipelineStage::Layout;
1236
1237 let engine = LayoutEngine::new(&tree);
1238 let (mut layout, mut layout_dump) = if collect_layout_dump {
1239 let (layout, profile) = engine
1240 .layout_with_profile(root_id)
1241 .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
1242 (layout, Some(layout_dump_from_profile(profile)))
1243 } else {
1244 let layout = engine
1245 .layout(root_id)
1246 .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
1247 (layout, None)
1248 };
1249
1250 if layout.pages.is_empty() {
1251 return Err(XfaError::LayoutFailed("layout produced 0 pages".into()));
1252 }
1253
1254 log::debug!("XFA layout: {} pages produced", layout.pages.len());
1255
1256 let trace_pages_produced = layout.pages.len();
1268 let trace_suppression = if flatten_trace::enabled() {
1271 compute_suppression_diags(&layout, &tree, pre_js_nonempty_fields.as_ref())
1272 } else {
1273 Vec::new()
1274 };
1275 let trust_layout = suppression_trust_layout_enabled();
1279 if layout.pages.len() > 1 {
1280 let keep: Vec<bool> = layout
1281 .pages
1282 .iter()
1283 .map(|p| {
1284 if p.runtime_instantiated {
1289 true
1290 } else if page_has_fields(&p.nodes, &tree) {
1291 let has_field_data = match pre_js_nonempty_fields {
1302 Some(ref snap) => page_has_field_data_snapshot(&p.nodes, snap),
1303 None => page_has_field_data(&p.nodes, &tree),
1304 };
1305 has_field_data || (trust_layout && page_has_visible_content(&p.nodes))
1306 } else {
1307 true
1308 }
1309 })
1310 .collect();
1311 let any_keep = keep.iter().any(|&k| k);
1312 if any_keep {
1313 let mut idx = 0;
1314 layout.pages.retain(|_| {
1315 let k = keep[idx];
1316 idx += 1;
1317 k
1318 });
1319 if let Some(ref mut dump) = layout_dump {
1320 let mut idx = 0;
1321 dump.pages.retain(|_| {
1322 let k = keep[idx];
1323 idx += 1;
1324 k
1325 });
1326 }
1327 }
1328 }
1332
1333 if let Some(ref mut dump) = layout_dump {
1334 renumber_layout_dump_pages(dump);
1335 }
1336
1337 debug_assert!(
1339 _stage <= PipelineStage::Render,
1340 "pipeline stage order violated: expected <= Render"
1341 );
1342 _stage = PipelineStage::Render;
1343
1344 let mut doc = match Document::load_mem(pdf_bytes) {
1345 Ok(d) => d,
1346 Err(_) => {
1347 eprintln!("lopdf load failed, creating minimal PDF structure for XFA layout");
1348 create_minimal_pdf_document()
1349 }
1350 };
1351
1352 debug_assert!(
1354 _stage <= PipelineStage::Embed,
1355 "pipeline stage order violated: expected <= Embed"
1356 );
1357 _stage = PipelineStage::Embed;
1358
1359 let (font_map, embedded_font_objects, metrics_data) =
1366 embed_resolved_fonts(&mut doc, &resolved_fonts, &layout);
1367
1368 let config = XfaRenderConfig {
1369 font_map: std::sync::Arc::new(font_map),
1370 font_metrics_data: std::sync::Arc::new(metrics_data),
1371 ..Default::default()
1372 };
1373
1374 let overlays = generate_all_overlays(&layout, &config)
1375 .map_err(|e| XfaError::LayoutFailed(format!("overlay generation: {e:?}")))?;
1376
1377 log::debug!(
1378 "XFA render: {} content streams generated ({} bytes total)",
1379 overlays.len(),
1380 overlays
1381 .iter()
1382 .map(|o| o.content_stream.len())
1383 .sum::<usize>()
1384 );
1385
1386 let font_ids: [ObjectId; 3] = [
1388 doc.add_object(Object::Dictionary(dictionary! {
1389 "Type" => Object::Name(b"Font".to_vec()),
1390 "Subtype" => Object::Name(b"Type1".to_vec()),
1391 "BaseFont" => Object::Name(b"Times-Roman".to_vec()),
1392 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1393 })),
1394 doc.add_object(Object::Dictionary(dictionary! {
1395 "Type" => Object::Name(b"Font".to_vec()),
1396 "Subtype" => Object::Name(b"Type1".to_vec()),
1397 "BaseFont" => Object::Name(b"Helvetica".to_vec()),
1398 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1399 })),
1400 doc.add_object(Object::Dictionary(dictionary! {
1401 "Type" => Object::Name(b"Font".to_vec()),
1402 "Subtype" => Object::Name(b"Type1".to_vec()),
1403 "BaseFont" => Object::Name(b"Courier".to_vec()),
1404 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
1405 })),
1406 ];
1407
1408 let existing_page_ids: Vec<ObjectId> = doc.page_iter().collect();
1409 let n_layout = overlays.len();
1410 let n_existing = existing_page_ids.len();
1411
1412 let is_static_form = template_xml.contains("baseProfile=\"interactiveForms\"");
1425 let has_static_content = pages_have_static_content(&doc);
1426
1427 let overlay_is_substantial = overlays.iter().any(|o| o.content_stream.len() > 1000);
1451 let preserve_static =
1458 is_static_form || n_layout < n_existing || has_static_content && overlay_is_substantial;
1459
1460 debug_assert!(
1462 _stage <= PipelineStage::Write,
1463 "pipeline stage order violated: expected <= Write"
1464 );
1465 _stage = PipelineStage::Write;
1466
1467 let mut trace_widgets_baked = 0usize;
1468 let mut trace_excess_deleted = 0usize;
1469 if preserve_static {
1470 let baked = flatten_widget_appearances(&mut doc);
1471 trace_widgets_baked = baked;
1472 if baked == 0 {
1473 if let Ok(fv_overlays) = generate_field_values_overlays(&layout, &config) {
1479 for (i, overlay) in fv_overlays.iter().enumerate() {
1480 if i < n_existing && !overlay.content_stream.is_empty() {
1481 let _ = overlay_page_content(
1482 &mut doc,
1483 existing_page_ids[i],
1484 overlay,
1485 &font_ids,
1486 &embedded_font_objects,
1487 );
1488 }
1489 }
1490 }
1491 }
1492 } else {
1497 for (i, overlay) in overlays.iter().enumerate() {
1505 if i < n_existing {
1506 let lp = &layout.pages[i];
1507 write_page_content(
1508 &mut doc,
1509 existing_page_ids[i],
1510 overlay,
1511 &font_ids,
1512 &embedded_font_objects,
1513 Some(lp.width),
1514 Some(lp.height),
1515 )?;
1516 } else {
1517 let lp = &layout.pages[i];
1518 add_new_page(
1519 &mut doc,
1520 lp.width,
1521 lp.height,
1522 overlay,
1523 &font_ids,
1524 &embedded_font_objects,
1525 )?;
1526 }
1527 }
1528
1529 for &page_id in &existing_page_ids[..n_existing.min(n_layout)] {
1534 bake_checkbox_radio_ap_marks(&mut doc, page_id);
1535 }
1536 }
1537
1538 if n_layout < n_existing && !preserve_static {
1546 let excess: Vec<u32> = ((n_layout + 1) as u32..=(n_existing as u32))
1549 .rev()
1550 .collect();
1551 trace_excess_deleted = excess.len();
1552 doc.delete_pages(&excess);
1553 }
1554
1555 if is_static_form {
1556 for &page_id in &existing_page_ids {
1562 strip_widget_annotations(&mut doc, page_id);
1563 }
1564 } else {
1565 for &page_id in existing_page_ids.iter().take(n_layout.min(n_existing)) {
1568 if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
1569 dict.remove(b"Annots");
1570 }
1571 }
1572 }
1573
1574 debug_assert!(
1576 _stage <= PipelineStage::Cleanup,
1577 "pipeline stage order violated: expected <= Cleanup"
1578 );
1579 #[allow(unused_assignments)]
1580 {
1581 _stage = PipelineStage::Cleanup;
1582 }
1583
1584 remove_acroform(&mut doc);
1585 let stripped_js = javascript_policy::strip_javascript_for_flatten(&mut doc);
1586 if stripped_js > 0 {
1587 log::warn!("stripped {stripped_js} JavaScript action(s) from flattened output");
1588 }
1589
1590 let trace_ctx = if flatten_trace::enabled() {
1593 let (acroform_removed, xfa_removed_structural, needs_rendering_removed) =
1594 catalog_cleanup_status(&doc);
1595 Some((
1596 acroform_removed,
1597 xfa_removed_structural,
1598 needs_rendering_removed,
1599 doc.page_iter().count(),
1600 layout
1601 .pages
1602 .iter()
1603 .filter(|p| p.runtime_instantiated)
1604 .count(),
1605 ))
1606 } else {
1607 None
1608 };
1609
1610 let mut out = Vec::new();
1611 doc.save_to(&mut out)
1612 .map_err(|e| XfaError::LayoutFailed(format!("save: {e}")))?;
1613
1614 if let Some((
1615 acroform_removed,
1616 xfa_removed_structural,
1617 needs_rendering_removed,
1618 output_page_count,
1619 runtime_pages,
1620 )) = trace_ctx
1621 {
1622 let js_mode =
1623 std::env::var("XFA_JS_EXECUTION_MODE").unwrap_or_else(|_| "best_effort_static".into());
1624 flatten_trace::emit(&flatten_trace::TraceInputs {
1625 suppression: &trace_suppression,
1626 input_bytes: pdf_bytes.len(),
1627 template_bytes: template_xml.len(),
1628 js_execution_mode: &js_mode,
1629 flatten_path: if preserve_static {
1630 "static_preserve"
1631 } else {
1632 "dynamic"
1633 },
1634 template_packet_found: true,
1635 datasets_packet_found: datasets_xml.is_some(),
1636 form_packet_found: form_xml.is_some(),
1637 image_files: trace_image_files,
1638 tree: &tree,
1639 scripts: &dynamic_scripts,
1640 layout: &layout,
1641 pages_produced: trace_pages_produced,
1642 pages_after_suppression: layout.pages.len(),
1643 runtime_instantiated_pages: runtime_pages,
1644 overlays: &overlays,
1645 n_layout,
1646 n_existing,
1647 is_static_form,
1648 has_static_content,
1649 preserve_static,
1650 excess_pages_deleted: trace_excess_deleted,
1651 widgets_baked: trace_widgets_baked,
1652 acroform_removed,
1653 xfa_removed_structural,
1654 needs_rendering_removed,
1655 javascript_actions_stripped: stripped_js,
1656 output_bytes: out.len(),
1657 output_page_count,
1658 });
1659 }
1660
1661 let mut dynamic_scripts = dynamic_scripts;
1663 dynamic_scripts.form_dom_match_failures = form_dom_match_failures;
1664 dynamic_scripts.form_dom_match_log = form_dom_match_log;
1665
1666 let mut flatten_out = FlattenOutput::new(out, layout_dump.unwrap_or_default(), dynamic_scripts);
1667 flatten_out.metadata.fresh_merge_admitted_nodes = fresh_merge_admitted;
1668 Ok(flatten_out)
1669}
1670
1671fn catalog_cleanup_status(doc: &Document) -> (bool, bool, bool) {
1675 let root_id = match doc.trailer.get(b"Root") {
1676 Ok(Object::Reference(id)) => *id,
1677 _ => return (true, true, true),
1678 };
1679 let Ok(cat) = doc.get_dictionary(root_id) else {
1680 return (true, true, true);
1681 };
1682 let acroform_present = cat.get(b"AcroForm").is_ok();
1683 let needs_rendering_present = cat.get(b"NeedsRendering").is_ok();
1684 let direct_xfa = cat.get(b"XFA").is_ok();
1685 let acroform_xfa = cat
1686 .get(b"AcroForm")
1687 .ok()
1688 .and_then(|o| match o {
1689 Object::Reference(id) => doc.get_dictionary(*id).ok(),
1690 Object::Dictionary(d) => Some(d),
1691 _ => None,
1692 })
1693 .map(|d| d.get(b"XFA").is_ok())
1694 .unwrap_or(false);
1695 (
1696 !acroform_present,
1697 !(direct_xfa || acroform_xfa),
1698 !needs_rendering_present,
1699 )
1700}
1701
1702fn layout_dump_from_profile(profile: LayoutProfile) -> LayoutDump {
1703 LayoutDump {
1704 pages: profile
1705 .pages
1706 .into_iter()
1707 .enumerate()
1708 .map(|(idx, page)| LayoutDumpEntry {
1709 page_num: idx as u32 + 1,
1710 page_height: page.page_height,
1711 used_height: page.used_height,
1712 overflow_to_next: page.overflow_to_next,
1713 first_overflow_element: page.first_overflow_element,
1714 })
1715 .collect(),
1716 ..Default::default()
1717 }
1718}
1719
1720fn renumber_layout_dump_pages(dump: &mut LayoutDump) {
1721 for (idx, page) in dump.pages.iter_mut().enumerate() {
1722 page.page_num = idx as u32 + 1;
1723 }
1724}
1725
1726fn extract_embedded_images(doc: &Document) -> HashMap<String, Vec<u8>> {
1736 let mut images = HashMap::new();
1737
1738 fn deref_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
1740 match obj {
1741 Object::Reference(id) => doc.get_dictionary(*id).ok(),
1742 Object::Dictionary(d) => Some(d),
1743 _ => None,
1744 }
1745 }
1746
1747 fn extract_stream(doc: &Document, obj: &Object) -> Option<Vec<u8>> {
1749 let stream_obj = match obj {
1750 Object::Reference(id) => doc.get_object(*id).ok()?,
1751 other => other,
1752 };
1753 if let Object::Stream(ref stream) = *stream_obj {
1754 let mut s = stream.clone();
1755 let _ = s.decompress();
1756 Some(s.content.clone())
1757 } else {
1758 None
1759 }
1760 }
1761
1762 let catalog = match doc.catalog() {
1764 Ok(c) => c,
1765 Err(_) => return images,
1766 };
1767 let names_obj = match catalog.get(b"Names") {
1768 Ok(obj) => obj,
1769 Err(_) => {
1770 eprintln!("[img-href] no /Names in catalog");
1771 return images;
1772 }
1773 };
1774 let names_dict = match deref_dict(doc, names_obj) {
1775 Some(d) => d,
1776 None => return images,
1777 };
1778 let ef_obj = match names_dict
1781 .get(b"XFAImages")
1782 .or_else(|_| names_dict.get(b"EmbeddedFiles"))
1783 {
1784 Ok(obj) => obj,
1785 Err(_) => return images,
1786 };
1787 let ef_dict = match deref_dict(doc, ef_obj) {
1788 Some(d) => d,
1789 None => return images,
1790 };
1791
1792 let names_arr_obj = match ef_dict.get(b"Names") {
1794 Ok(obj) => obj,
1795 Err(_) => return images,
1796 };
1797 let names_array = match names_arr_obj {
1798 Object::Array(arr) => arr,
1799 Object::Reference(id) => match doc.get_object(*id) {
1800 Ok(Object::Array(arr)) => arr,
1801 _ => return images,
1802 },
1803 _ => return images,
1804 };
1805
1806 let mut i = 0;
1808 while i + 1 < names_array.len() {
1809 let name = match &names_array[i] {
1810 Object::String(bytes, _) => String::from_utf8_lossy(bytes).to_string(),
1811 _ => {
1812 i += 2;
1813 continue;
1814 }
1815 };
1816
1817 let value_ref = &names_array[i + 1];
1821
1822 if let Some(filespec) = deref_dict(doc, value_ref) {
1824 if let Ok(ef_obj) = filespec.get(b"EF") {
1825 if let Some(ef) = deref_dict(doc, ef_obj) {
1826 if let Ok(f_ref) = ef.get(b"F") {
1827 if let Some(data) = extract_stream(doc, f_ref) {
1828 images.insert(name.clone(), data);
1829 i += 2;
1830 continue;
1831 }
1832 }
1833 }
1834 }
1835 }
1836
1837 if let Some(data) = extract_stream(doc, value_ref) {
1839 images.insert(name.clone(), data);
1840 }
1841
1842 i += 2;
1843 }
1844 images
1845}
1846
1847#[doc(hidden)]
1865pub fn extract_embedded_fonts(doc: &Document) -> Vec<EmbeddedFontData> {
1866 let mut fonts = Vec::new();
1867 let mut seen = std::collections::HashSet::new();
1868 for (&font_object_id, obj) in &doc.objects {
1869 let dict = match obj.as_dict() {
1870 Ok(d) => d,
1871 Err(_) => continue,
1872 };
1873 let is_font =
1874 dict.get(b"Type").ok().and_then(|o| o.as_name().ok()) == Some(b"Font".as_slice());
1875 if !is_font {
1876 continue;
1877 }
1878 let base_font = match dict.get(b"BaseFont").ok().and_then(|o| o.as_name().ok()) {
1879 Some(n) => String::from_utf8_lossy(n).to_string(),
1880 None => continue,
1881 };
1882
1883 let pdf_widths = extract_font_widths(dict);
1884 let pdf_encoding = extract_font_encoding(doc, dict);
1885 let pdf_source_font =
1886 extract_simple_pdf_source_font(doc, font_object_id, dict, pdf_widths.as_ref());
1887
1888 if let Some((stream_id, data)) = extract_font_from_direct_fd(doc, dict, &base_font) {
1890 if seen.insert(stream_id) {
1891 store_font_data(
1892 &mut fonts,
1893 &base_font,
1894 data,
1895 pdf_widths.clone(),
1896 pdf_encoding.clone(),
1897 pdf_source_font,
1898 );
1899 }
1900 continue;
1901 }
1902
1903 if let Some((stream_id, data)) = extract_cidfont_data(doc, dict, &base_font, &seen) {
1907 if seen.insert(stream_id) {
1908 let cid_widths = extract_cid_font_widths(doc, dict);
1909 store_font_data(&mut fonts, &base_font, data, cid_widths, None, None);
1910 }
1911 continue;
1912 }
1913
1914 if let Some(source_font) = pdf_source_font {
1915 store_font_data(
1921 &mut fonts,
1922 &base_font,
1923 Vec::new(),
1924 pdf_widths.clone(),
1925 pdf_encoding.clone(),
1926 Some(source_font),
1927 );
1928 }
1929 }
1930 fonts
1931}
1932
1933fn extract_font_widths(dict: &lopdf::Dictionary) -> Option<(u16, Vec<u16>)> {
1935 let first_char = dict.get(b"FirstChar").ok()?.as_i64().ok()? as u16;
1936 let _last_char = dict.get(b"LastChar").ok()?.as_i64().ok()? as u16;
1937 let widths_array = dict.get(b"Widths").ok()?.as_array().ok()?;
1938 let widths: Vec<u16> = widths_array
1939 .iter()
1940 .filter_map(|w| w.as_i64().ok().map(|v| v as u16))
1941 .collect();
1942 if widths.is_empty() {
1943 return None;
1944 }
1945 Some((first_char, widths))
1946}
1947
1948fn extract_cid_font_widths(
1965 doc: &Document,
1966 type0_dict: &lopdf::Dictionary,
1967) -> Option<(u16, Vec<u16>)> {
1968 let descendants = type0_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
1969 let desc_ref = descendants.first()?;
1970 let cid_dict = match desc_ref {
1971 Object::Reference(id) => doc.get_dictionary(*id).ok()?,
1972 Object::Dictionary(d) => d,
1973 _ => return None,
1974 };
1975
1976 let default_width = cid_dict
1977 .get(b"DW")
1978 .ok()
1979 .and_then(|o| o.as_i64().ok())
1980 .unwrap_or(1000) as u16;
1981
1982 let w_array = cid_dict.get(b"W").ok()?;
1983 let w_array = match resolve_object(doc, w_array) {
1984 Some(obj) => obj.as_array().ok()?,
1985 None => return None,
1986 };
1987
1988 if w_array.is_empty() {
1989 return None;
1990 }
1991
1992 let mut entries: Vec<(u16, u16)> = Vec::new();
1994 let mut i = 0;
1995 while i < w_array.len() {
1996 let cid_start = match w_array[i].as_i64() {
1997 Ok(v) => v as u16,
1998 Err(_) => {
1999 i += 1;
2000 continue;
2001 }
2002 };
2003 i += 1;
2004 if i >= w_array.len() {
2005 break;
2006 }
2007
2008 if let Ok(widths_arr) = w_array[i].as_array() {
2010 for (j, w_obj) in widths_arr.iter().enumerate() {
2012 if let Ok(w) = w_obj.as_i64() {
2013 entries.push((cid_start + j as u16, w as u16));
2014 }
2015 }
2016 i += 1;
2017 } else if let Ok(cid_last) = w_array[i].as_i64() {
2018 i += 1;
2020 if i >= w_array.len() {
2021 break;
2022 }
2023 if let Ok(width) = w_array[i].as_i64() {
2024 let cid_last = cid_last as u16;
2025 for cid in cid_start..=cid_last {
2026 entries.push((cid, width as u16));
2027 }
2028 }
2029 i += 1;
2030 } else {
2031 i += 1;
2032 }
2033 }
2034
2035 if entries.is_empty() {
2036 return None;
2037 }
2038
2039 let min_cid = entries
2041 .iter()
2042 .map(|(c, _)| *c)
2043 .min()
2044 .expect("entries is non-empty");
2045 let max_cid = entries
2046 .iter()
2047 .map(|(c, _)| *c)
2048 .max()
2049 .expect("entries is non-empty");
2050 let len = (max_cid - min_cid + 1) as usize;
2051 let mut widths = vec![default_width; len];
2052 for (cid, w) in &entries {
2053 widths[(*cid - min_cid) as usize] = *w;
2054 }
2055
2056 Some((min_cid, widths))
2057}
2058
2059fn extract_font_encoding(doc: &Document, dict: &lopdf::Dictionary) -> Option<PdfSimpleEncoding> {
2071 let encoding_obj = resolve_object(doc, dict.get(b"Encoding").ok()?)?;
2072 let encoding_dict = encoding_obj.as_dict().ok()?;
2073 let differences_array = resolve_object(doc, encoding_dict.get(b"Differences").ok()?)?
2074 .as_array()
2075 .ok()?;
2076
2077 let base_encoding = encoding_dict
2078 .get(b"BaseEncoding")
2079 .ok()
2080 .and_then(|obj| resolve_object(doc, obj))
2081 .and_then(|obj| obj.as_name().ok())
2082 .and_then(PdfBaseEncoding::from_pdf_name)
2083 .unwrap_or(PdfBaseEncoding::WinAnsi);
2084
2085 let mut differences = Vec::new();
2086 let mut current_code: Option<u8> = None;
2087 for item in differences_array {
2088 let item = resolve_object(doc, item)?;
2089 if let Ok(code) = item.as_i64() {
2090 current_code = u8::try_from(code).ok();
2091 continue;
2092 }
2093
2094 let Some(name) = item.as_name().ok() else {
2095 continue;
2096 };
2097 let Some(code) = current_code else {
2098 continue;
2099 };
2100 let Some(glyph_name) = std::str::from_utf8(name).ok() else {
2101 continue;
2102 };
2103 if let Some(unicode) = pdf_glyph_name_to_unicode(glyph_name) {
2104 differences.push((code, unicode));
2105 }
2106 current_code = code.checked_add(1);
2107 }
2108
2109 if differences.is_empty() {
2110 return None;
2111 }
2112
2113 Some(PdfSimpleEncoding {
2114 base_encoding,
2115 differences,
2116 })
2117}
2118
2119fn extract_simple_pdf_source_font(
2120 doc: &Document,
2121 font_object_id: ObjectId,
2122 dict: &lopdf::Dictionary,
2123 pdf_widths: Option<&(u16, Vec<u16>)>,
2124) -> Option<PdfSourceFont> {
2125 pdf_widths?;
2126
2127 let subtype = dict.get(b"Subtype").ok().and_then(|obj| obj.as_name().ok());
2128 if subtype == Some(b"Type0".as_slice()) {
2129 return None;
2130 }
2131
2132 let encoding_obj = dict
2141 .get(b"Encoding")
2142 .ok()
2143 .and_then(|obj| resolve_object(doc, obj));
2144 match encoding_obj {
2145 Some(obj) if obj.as_name().ok() == Some(b"WinAnsiEncoding".as_slice()) => {}
2146 Some(obj) => {
2147 let base = obj
2148 .as_dict()
2149 .ok()
2150 .and_then(|enc| enc.get(b"BaseEncoding").ok())
2151 .and_then(|base| resolve_object(doc, base))
2152 .and_then(|base| base.as_name().ok());
2153 if base != Some(b"WinAnsiEncoding".as_slice()) {
2154 return None;
2155 }
2156 if obj
2157 .as_dict()
2158 .ok()
2159 .and_then(|enc| enc.get(b"Differences").ok())
2160 .is_some()
2161 {
2162 return None;
2163 }
2164 }
2165 None => return None,
2166 }
2167
2168 Some(PdfSourceFont {
2169 object_id: font_object_id,
2170 })
2171}
2172
2173fn resolve_object<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
2174 match obj {
2175 Object::Reference(id) => doc.get_object(*id).ok(),
2176 other => Some(other),
2177 }
2178}
2179
2180fn extract_font_from_direct_fd(
2182 doc: &Document,
2183 font_dict: &lopdf::Dictionary,
2184 _base_font: &str,
2185) -> Option<(lopdf::ObjectId, Vec<u8>)> {
2186 let fd_id = font_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
2187 let fd = doc.get_dictionary(fd_id).ok()?;
2188
2189 let font_stream_id = fd
2190 .get(b"FontFile2")
2191 .or_else(|_| fd.get(b"FontFile3"))
2192 .or_else(|_| fd.get(b"FontFile"))
2193 .ok()?
2194 .as_reference()
2195 .ok()?;
2196
2197 let stream = doc
2198 .get_object(font_stream_id)
2199 .and_then(|o| o.as_stream())
2200 .ok()?;
2201
2202 let data = stream
2203 .get_plain_content()
2204 .unwrap_or_else(|_| stream.content.clone());
2205
2206 if data.is_empty() {
2207 return None;
2208 }
2209
2210 Some((font_stream_id, data))
2211}
2212
2213fn extract_cidfont_data(
2218 doc: &Document,
2219 font_dict: &lopdf::Dictionary,
2220 _base_font: &str,
2221 seen: &std::collections::HashSet<lopdf::ObjectId>,
2222) -> Option<(lopdf::ObjectId, Vec<u8>)> {
2223 let descendants = font_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
2225
2226 for desc_ref in descendants {
2228 let desc_id = desc_ref.as_reference().ok()?;
2229 let desc_dict = doc.get_dictionary(desc_id).ok()?;
2230
2231 let fd_id = desc_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
2233 let fd = doc.get_dictionary(fd_id).ok()?;
2234
2235 let font_stream_id = fd
2237 .get(b"FontFile3")
2238 .or_else(|_| fd.get(b"FontFile2"))
2239 .or_else(|_| fd.get(b"FontFile"))
2240 .ok()?
2241 .as_reference()
2242 .ok()?;
2243
2244 if seen.contains(&font_stream_id) {
2245 continue;
2246 }
2247
2248 let stream = doc
2249 .get_object(font_stream_id)
2250 .and_then(|o| o.as_stream())
2251 .ok()?;
2252
2253 let data = stream
2254 .get_plain_content()
2255 .unwrap_or_else(|_| stream.content.clone());
2256
2257 if !data.is_empty() {
2258 return Some((font_stream_id, data));
2259 }
2260 }
2261 None
2262}
2263
2264fn store_font_data(
2266 fonts: &mut Vec<EmbeddedFontData>,
2267 base_font: &str,
2268 data: Vec<u8>,
2269 pdf_widths: Option<(u16, Vec<u16>)>,
2270 pdf_encoding: Option<PdfSimpleEncoding>,
2271 pdf_source_font: Option<PdfSourceFont>,
2272) {
2273 let clean_name = if let Some(pos) = base_font.find('+') {
2274 base_font[pos + 1..].to_string()
2275 } else {
2276 base_font.to_string()
2277 };
2278 let allow_family_alias = family_alias_is_regular_face(&clean_name, &data);
2279
2280 fonts.push(EmbeddedFontData {
2282 name: clean_name.clone(),
2283 data: data.clone(),
2284 pdf_widths: pdf_widths.clone(),
2285 pdf_encoding: pdf_encoding.clone(),
2286 pdf_source_font,
2287 });
2288
2289 if let Ok(face) = ttf_parser::Face::parse(&data, 0) {
2293 for name_record in face.names() {
2294 let allow_alias = match name_record.name_id {
2295 ttf_parser::name_id::FAMILY => allow_family_alias,
2296 ttf_parser::name_id::FULL_NAME | ttf_parser::name_id::POST_SCRIPT_NAME => true,
2297 _ => false,
2298 };
2299 if !allow_alias {
2300 continue;
2301 }
2302 if let Some(alias) = name_record.to_string() {
2303 if alias != clean_name {
2304 fonts.push(EmbeddedFontData {
2305 name: alias,
2306 data: data.clone(),
2307 pdf_widths: pdf_widths.clone(),
2308 pdf_encoding: pdf_encoding.clone(),
2309 pdf_source_font,
2310 });
2311 }
2312 }
2313 }
2314 }
2315
2316 let normalized = ps_name_to_family(&clean_name);
2320 if allow_family_alias && normalized != clean_name {
2321 fonts.push(EmbeddedFontData {
2322 name: normalized,
2323 data,
2324 pdf_widths,
2325 pdf_encoding,
2326 pdf_source_font,
2327 });
2328 }
2329}
2330
2331fn family_alias_is_regular_face(clean_name: &str, data: &[u8]) -> bool {
2332 if let Ok(face) = ttf_parser::Face::parse(data, 0) {
2333 if face.is_bold() || face.is_italic() {
2334 return false;
2335 }
2336 }
2337
2338 let lower = clean_name.to_ascii_lowercase();
2339 !lower.contains("bold") && !lower.contains("italic") && !lower.contains("oblique")
2340}
2341
/// Derives a human-readable family name from a PostScript font name.
///
/// The first matching style suffix (e.g. `PSMT`, `-BoldMT`, `MT`,
/// `-Regular`) is removed, then a space is inserted at every
/// lowercase-to-uppercase boundary: `TimesNewRomanPSMT` → `Times New Roman`.
///
/// Boundaries are detected on characters, not bytes, so names containing
/// non-ASCII letters are split correctly (`CaféNoir` → `Café Noir`).
fn ps_name_to_family(ps_name: &str) -> String {
    // Order matters: longer, more specific suffixes are tried first.
    const STYLE_SUFFIXES: [&str; 12] = [
        "PSMT",
        "PS-BoldItalicMT",
        "PS-BoldMT",
        "PS-ItalicMT",
        "-BoldItalicMT",
        "-BoldMT",
        "-ItalicMT",
        "MT",
        "-Regular",
        "-Bold",
        "-Italic",
        "-BoldItalic",
    ];

    let base = STYLE_SUFFIXES
        .iter()
        .find_map(|suffix| ps_name.strip_suffix(*suffix))
        .unwrap_or(ps_name);

    let mut result = String::with_capacity(base.len() + 4);
    let mut previous: Option<char> = None;
    for ch in base.chars() {
        if ch.is_uppercase() && previous.is_some_and(char::is_lowercase) {
            result.push(' ');
        }
        result.push(ch);
        previous = Some(ch);
    }
    result
}
2376
/// One distinct font variant referenced by a `<font>` element of the XFA
/// template, as gathered by `collect_template_font_entries`.
struct TemplateFontEntry {
    /// Value of the element's `typeface` attribute (never empty).
    typeface: String,
    /// Value of the `weight` attribute, if present.
    weight: Option<String>,
    /// Value of the `posture` attribute, if present.
    posture: Option<String>,
    /// Value of the `genericFamily` attribute, if present.
    generic_family: Option<String>,
}
2384
2385fn collect_template_font_entries(template_xml: &str) -> Vec<TemplateFontEntry> {
2386 let mut entries = Vec::new();
2387 let mut seen = std::collections::HashSet::new();
2388 if let Ok(xml_doc) = roxmltree::Document::parse(template_xml) {
2389 for node in xml_doc.descendants() {
2390 if node.tag_name().name() == "font" {
2391 if let Some(typeface) = node.attribute("typeface") {
2392 let name = typeface.to_string();
2393 let weight = node.attribute("weight").map(|s| s.to_string());
2394 let posture = node.attribute("posture").map(|s| s.to_string());
2395 let generic_family = node.attribute("genericFamily").map(|s| s.to_string());
2396 let key = font_variant_key(&name, weight.as_deref(), posture.as_deref());
2397 if !name.is_empty() && seen.insert(key.to_lowercase()) {
2398 entries.push(TemplateFontEntry {
2399 typeface: name,
2400 weight,
2401 posture,
2402 generic_family,
2403 });
2404 }
2405 }
2406 }
2407 }
2408 }
2409 entries
2410}
2411
2412fn embed_font_in_pdf(doc: &mut Document, font: &ResolvedFont) -> ObjectId {
2413 let font_stream = Stream::new(
2414 dictionary! {
2415 "Length" => Object::Integer(font.data.len() as i64),
2416 "Length1" => Object::Integer(font.data.len() as i64)
2417 },
2418 font.data.clone(),
2419 );
2420 let font_file_id = doc.add_object(Object::Stream(font_stream));
2421
2422 let upem = font.units_per_em as f64;
2423 let scale = 1000.0 / upem.max(1.0);
2424 let ascent = (font.ascender as f64 * scale) as i64;
2425 let descent = (font.descender as f64 * scale) as i64;
2426 let cap_height = (ascent as f64 * 0.7) as i64;
2427 let base_name = font.name.replace(' ', "-");
2428
2429 let fd = dictionary! {
2430 "Type" => Object::Name(b"FontDescriptor".to_vec()),
2431 "FontName" => Object::Name(base_name.as_bytes().to_vec()),
2432 "Flags" => Object::Integer(32),
2433 "FontBBox" => Object::Array(vec![
2434 Object::Integer(0),
2435 Object::Integer(descent),
2436 Object::Integer(1000),
2437 Object::Integer(ascent),
2438 ]),
2439 "ItalicAngle" => Object::Integer(0),
2440 "Ascent" => Object::Integer(ascent),
2441 "Descent" => Object::Integer(descent),
2442 "CapHeight" => Object::Integer(cap_height),
2443 "StemV" => Object::Integer(80),
2444 "FontFile2" => Object::Reference(font_file_id)
2445 };
2446 let fd_id = doc.add_object(Object::Dictionary(fd));
2447
2448 let cid_info = font.cid_font_info().unwrap_or(CidFontInfo {
2450 widths: vec![500],
2451 gid_to_unicode: vec![],
2452 });
2453
2454 let widths_inner: Vec<Object> = cid_info
2456 .widths
2457 .iter()
2458 .map(|&w| Object::Integer(w as i64))
2459 .collect();
2460 let w_array = vec![Object::Integer(0), Object::Array(widths_inner)];
2461
2462 let cid_font = dictionary! {
2463 "Type" => Object::Name(b"Font".to_vec()),
2464 "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
2465 "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
2466 "CIDSystemInfo" => Object::Dictionary(dictionary! {
2467 "Registry" => Object::String(b"Adobe".to_vec(), StringFormat::Literal),
2468 "Ordering" => Object::String(b"Identity".to_vec(), StringFormat::Literal),
2469 "Supplement" => Object::Integer(0)
2470 }),
2471 "FontDescriptor" => Object::Reference(fd_id),
2472 "W" => Object::Array(w_array),
2473 "CIDToGIDMap" => Object::Name(b"Identity".to_vec())
2474 };
2475 let cid_font_id = doc.add_object(Object::Dictionary(cid_font));
2476
2477 let tounicode_data = generate_tounicode_cmap(&cid_info.gid_to_unicode);
2479 let tounicode_stream = Stream::new(
2480 dictionary! { "Length" => Object::Integer(tounicode_data.len() as i64) },
2481 tounicode_data,
2482 );
2483 let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));
2484
2485 let type0_font = dictionary! {
2487 "Type" => Object::Name(b"Font".to_vec()),
2488 "Subtype" => Object::Name(b"Type0".to_vec()),
2489 "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
2490 "Encoding" => Object::Name(b"Identity-H".to_vec()),
2491 "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
2492 "ToUnicode" => Object::Reference(tounicode_id)
2493 };
2494 doc.add_object(Object::Dictionary(type0_font))
2495}
2496
/// Builds the text of a ToUnicode CMap that maps each 2-byte glyph code to
/// its Unicode character.
///
/// Mappings are written as `bfchar` entries in blocks of at most 100. The
/// destination of each entry is the character's UTF-16BE encoding in hex:
/// four digits for BMP characters, a surrogate pair (eight digits) for
/// characters above U+FFFF. An empty input produces a CMap with no `bfchar`
/// blocks.
fn generate_tounicode_cmap(gid_to_unicode: &[(u16, char)]) -> Vec<u8> {
    let mut cmap = String::with_capacity(gid_to_unicode.len() * 24 + 256);
    cmap.push_str("/CIDInit /ProcSet findresource begin\n");
    cmap.push_str("12 dict begin\n");
    cmap.push_str("begincmap\n");
    cmap.push_str("/CIDSystemInfo\n");
    cmap.push_str("<< /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n");
    cmap.push_str("/CMapName /Adobe-Identity-UCS def\n");
    cmap.push_str("/CMapType 2 def\n");
    cmap.push_str("1 begincodespacerange\n");
    cmap.push_str("<0000> <FFFF>\n");
    cmap.push_str("endcodespacerange\n");

    for chunk in gid_to_unicode.chunks(100) {
        // Writing into a `String` cannot fail, so the results are ignored.
        let _ = writeln!(cmap, "{} beginbfchar", chunk.len());
        for &(gid, ch) in chunk {
            let _ = write!(cmap, "<{gid:04X}> <");
            let mut utf16 = [0u16; 2];
            for unit in ch.encode_utf16(&mut utf16).iter() {
                let _ = write!(cmap, "{unit:04X}");
            }
            cmap.push_str(">\n");
        }
        cmap.push_str("endbfchar\n");
    }

    cmap.push_str("endcmap\n");
    cmap.push_str("CMapName currentdict /CMap defineresource pop\n");
    cmap.push_str("end\nend\n");
    cmap.into_bytes()
}
2522
2523fn resolve_template_fonts(template_xml: &str, pdf_bytes: &[u8]) -> HashMap<String, ResolvedFont> {
2530 let mut resolved = HashMap::new();
2531 let entries = collect_template_font_entries(template_xml);
2532 if entries.is_empty() {
2533 return resolved;
2534 }
2535 let source_doc = match Document::load_mem(pdf_bytes) {
2536 Ok(d) => d,
2537 Err(_) => return resolved,
2538 };
2539 let embedded_fonts = extract_embedded_fonts(&source_doc);
2540 let mut resolver = XfaFontResolver::new(embedded_fonts);
2541 for entry in &entries {
2542 let spec = XfaFontSpec::from_xfa_attrs(
2543 &entry.typeface,
2544 entry.weight.as_deref(),
2545 entry.posture.as_deref(),
2546 None,
2547 entry.generic_family.as_deref(),
2548 );
2549 let key = font_variant_key(
2550 &entry.typeface,
2551 entry.weight.as_deref(),
2552 entry.posture.as_deref(),
2553 );
2554 match resolver.resolve(&spec) {
2555 Ok(font) => {
2556 resolved.insert(key, font);
2557 }
2558 Err(e) => {
2559 eprintln!("Font resolution failed for '{}': {}", entry.typeface, e);
2560 }
2561 }
2562 }
2563 resolved
2564}
2565
2566fn inject_resolved_metrics(
2575 tree: &mut xfa_layout_engine::form::FormTree,
2576 resolved: &HashMap<String, ResolvedFont>,
2577) {
2578 for i in 0..tree.nodes.len() {
2579 let id = xfa_layout_engine::form::FormNodeId(i);
2580 let style = &tree.meta(id).style;
2581 let font_family = style.font_family.clone();
2582 let font_weight = style.font_weight.clone();
2583 let font_style = style.font_style.clone();
2584 if let Some(ref family) = font_family {
2585 let variant_key =
2587 font_variant_key(family, font_weight.as_deref(), font_style.as_deref());
2588 let base_key = font_variant_key(family, None, None);
2589 let font = resolved
2590 .get(&variant_key)
2591 .or_else(|| resolved.get(&base_key));
2592 if let Some(font) = font {
2593 let (_first_char, widths) = font.pdf_glyph_widths();
2594 let node = tree.get_mut(id);
2595 node.font.resolved_widths = Some(widths);
2596 node.font.resolved_upem = Some(font.units_per_em);
2597 node.font.resolved_ascender = Some(font.ascender);
2598 node.font.resolved_descender = Some(font.descender);
2599 }
2600 }
2601 }
2602}
2603
2604fn simple_encoding_unicode_to_code_map(encoding: &PdfSimpleEncoding) -> HashMap<u16, u8> {
2609 let mut map = HashMap::new();
2610 for (code, unicode) in encoding.code_to_unicode_table().into_iter().enumerate() {
2611 if let Some(cp) = unicode {
2612 map.entry(cp).or_insert(code as u8);
2613 }
2614 }
2615 map
2616}
2617
2618fn add_text_chars_for_font(
2619 chars_by_font: &mut HashMap<String, HashSet<char>>,
2620 font_family: Option<&str>,
2621 font_weight: Option<&str>,
2622 font_style: Option<&str>,
2623 text: &str,
2624) {
2625 let Some(family) = font_family else {
2626 return;
2627 };
2628 if text.is_empty() {
2629 return;
2630 }
2631 let chars: Vec<char> = text.chars().filter(|c| !c.is_control()).collect();
2632 if chars.is_empty() {
2633 return;
2634 }
2635
2636 let variant = font_variant_key(family, font_weight, font_style);
2637 chars_by_font
2638 .entry(variant)
2639 .or_default()
2640 .extend(chars.iter().copied());
2641 chars_by_font
2642 .entry(family.to_string())
2643 .or_default()
2644 .extend(chars);
2645}
2646
2647fn add_text_chars_for_style(
2648 chars_by_font: &mut HashMap<String, HashSet<char>>,
2649 style: &FormNodeStyle,
2650 text: &str,
2651) {
2652 add_text_chars_for_font(
2653 chars_by_font,
2654 style.font_family.as_deref(),
2655 style.font_weight.as_deref(),
2656 style.font_style.as_deref(),
2657 text,
2658 );
2659}
2660
2661fn collect_used_chars_from_layout_node(
2662 node: &LayoutNode,
2663 chars_by_font: &mut HashMap<String, HashSet<char>>,
2664) {
2665 match &node.content {
2666 LayoutContent::Text(t) => add_text_chars_for_style(chars_by_font, &node.style, t),
2667 LayoutContent::Field { value, .. } => {
2668 add_text_chars_for_style(chars_by_font, &node.style, value)
2669 }
2670 LayoutContent::WrappedText { lines, .. } => {
2671 for line in lines {
2672 add_text_chars_for_style(chars_by_font, &node.style, line);
2673 }
2674 }
2675 LayoutContent::Draw(DrawContent::Text(t)) => {
2676 add_text_chars_for_style(chars_by_font, &node.style, t)
2677 }
2678 _ => {}
2679 }
2680
2681 if let Some(caption) = &node.style.caption_text {
2682 add_text_chars_for_style(chars_by_font, &node.style, caption);
2683 }
2684
2685 if let Some(spans) = &node.style.rich_text_spans {
2686 for span in spans {
2687 add_text_chars_for_font(
2688 chars_by_font,
2689 span.font_family
2690 .as_deref()
2691 .or(node.style.font_family.as_deref()),
2692 span.font_weight
2693 .as_deref()
2694 .or(node.style.font_weight.as_deref()),
2695 span.font_style
2696 .as_deref()
2697 .or(node.style.font_style.as_deref()),
2698 &span.text,
2699 );
2700 }
2701 }
2702
2703 for child in &node.children {
2704 collect_used_chars_from_layout_node(child, chars_by_font);
2705 }
2706}
2707
2708fn collect_used_chars_by_font(layout: &LayoutDom) -> HashMap<String, HashSet<char>> {
2709 let mut chars_by_font = HashMap::new();
2710 for page in &layout.pages {
2711 for node in &page.nodes {
2712 collect_used_chars_from_layout_node(node, &mut chars_by_font);
2713 }
2714 }
2715 chars_by_font
2716}
2717
2718fn simple_font_can_encode_char(font: &ResolvedFont, ch: char) -> bool {
2719 if ch.is_ascii() {
2720 return true;
2721 }
2722 if let Some(encoding) = &font.pdf_encoding {
2723 let Ok(cp) = u16::try_from(ch as u32) else {
2724 return false;
2725 };
2726 return encoding
2727 .code_to_unicode_table()
2728 .into_iter()
2729 .flatten()
2730 .any(|u| u == cp);
2731 }
2732 unicode_to_winansi(ch).is_some()
2733}
2734
/// Strips a trailing weight/posture marker (`_Bold_Italic`, `_Bold_Normal`,
/// `_Normal_Italic` or `_Normal_Normal`) from a font variant key.
///
/// Returns the part before the marker, or `None` when `key` ends with none
/// of them.
fn variant_key_base_name(key: &str) -> Option<&str> {
    const VARIANT_SUFFIXES: [&str; 4] = [
        "_Bold_Italic",
        "_Bold_Normal",
        "_Normal_Italic",
        "_Normal_Normal",
    ];
    VARIANT_SUFFIXES
        .iter()
        .find_map(|suffix| key.strip_suffix(*suffix))
}
2741
2742#[allow(clippy::type_complexity)]
2743fn embed_resolved_fonts(
2744 doc: &mut Document,
2745 resolved: &HashMap<String, ResolvedFont>,
2746 layout: &LayoutDom,
2747) -> (
2748 HashMap<String, String>,
2749 Vec<(String, ObjectId)>,
2750 HashMap<String, FontMetricsData>,
2751) {
2752 let mut font_map = HashMap::new();
2753 let mut font_objects = Vec::new();
2754 let mut metrics_data = HashMap::new();
2755 let used_chars_by_font = collect_used_chars_by_font(layout);
2756 for (idx, (name, font)) in resolved.iter().enumerate() {
2757 let resource_name = format!("XFA_F{}", idx);
2758 let used_chars = used_chars_by_font
2771 .get(name)
2772 .or_else(|| used_chars_by_font.get(&font.name))
2773 .or_else(|| variant_key_base_name(name).and_then(|base| used_chars_by_font.get(base)));
2774 let source_can_encode_all_text = used_chars.is_none_or(|chars| {
2775 chars
2776 .iter()
2777 .all(|ch| simple_font_can_encode_char(font, *ch))
2778 });
2779 let (obj_id, render_font_data) = if let Some(source_font) = font.pdf_source_font {
2780 if source_can_encode_all_text || font.data.is_empty() {
2781 (source_font.object_id, None)
2782 } else {
2783 (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2784 }
2785 } else {
2786 (embed_font_in_pdf(doc, font), Some(font.data.clone()))
2787 };
2788 font_map.insert(name.clone(), format!("/{}", resource_name));
2789 font_objects.push((resource_name, obj_id));
2790 let (_first_char, widths) = font.pdf_glyph_widths();
2791 metrics_data.insert(
2792 name.clone(),
2793 FontMetricsData {
2794 widths,
2795 upem: font.units_per_em,
2796 ascender: font.ascender,
2797 descender: font.descender,
2798 font_data: render_font_data,
2799 face_index: font.face_index,
2800 simple_unicode_to_code: font
2801 .pdf_encoding
2802 .as_ref()
2803 .map(simple_encoding_unicode_to_code_map),
2804 },
2805 );
2806 }
2807 (font_map, font_objects, metrics_data)
2808}
2809
2810fn static_fallback(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
2817 let mut doc = match Document::load_mem(pdf_bytes) {
2818 Ok(d) => d,
2819 Err(e) => {
2820 eprintln!("static_fallback: lopdf load failed ({e}), returning original bytes");
2821 return Ok(pdf_bytes.to_vec());
2822 }
2823 };
2824 strip_widgets_and_acroform(&mut doc);
2825 javascript_policy::strip_javascript_for_flatten(&mut doc);
2826 let mut out = Vec::new();
2827 if let Err(e) = doc.save_to(&mut out) {
2828 eprintln!("static_fallback: save failed ({e}), returning original bytes");
2829 return Ok(pdf_bytes.to_vec());
2830 }
2831 Ok(out)
2832}
2833
// Per-thread diagnostic log used by `apply_form_dom_presence`.
//
// `None` means logging is off. `apply_form_dom_presence` installs
// `Some(Vec::new())` when diagnostics are enabled, appends an entry for each
// template subform it suppresses, and takes the vector back out before
// returning.
std::thread_local! {
    static FORM_DOM_MATCH_LOG: std::cell::RefCell<Option<Vec<FormDomMatchEntry>>> =
        const { std::cell::RefCell::new(None) };
}
2840
/// Reconciles the merged form tree with a saved form DOM (`form_xml`).
///
/// The form DOM is walked in parallel with the tree below `root_id`:
/// * `presence="hidden"` on a `subform`, `field` or `pageArea` element hides
///   the matching tree node;
/// * a field whose tree value is empty receives the value stored in the form
///   DOM;
/// * when the form DOM holds more consecutive same-named instances than the
///   tree, the first matching tree node is deep-cloned to make up the
///   difference;
/// * named subforms in the tree that have no counterpart in the form DOM are
///   hidden, unless they qualify for admission (see `apply_recursive`).
///
/// Returns `(admitted, match_failures, match_log)`: the number of unmatched
/// subforms that were admitted rather than hidden, the number of logged
/// suppressions, and the log itself. The log is only collected when runtime
/// diagnostics or flatten tracing are enabled; otherwise the last two values
/// are `0` and an empty vector. If `form_xml` does not parse, the tree is
/// left untouched and `(0, 0, [])` is returned.
pub(crate) fn apply_form_dom_presence(
    tree: &mut FormTree,
    root_id: FormNodeId,
    form_xml: &str,
    policy: XfaRenderingPolicy,
    admit_databound_override: bool,
) -> (usize, usize, Vec<FormDomMatchEntry>) {
    use xfa_layout_engine::form::{FormNodeType, Presence};

    let Ok(doc) = roxmltree::Document::parse(form_xml) else {
        return (0, 0, Vec::new());
    };

    /// Deep-copies the subtree rooted at `src_id` (node and metadata) and
    /// returns the id of the copy's root. The copy's `xfa_id` is cleared.
    fn clone_subtree(tree: &mut FormTree, src_id: FormNodeId) -> FormNodeId {
        let node = tree.get(src_id).clone();
        let meta = tree.meta(src_id).clone();
        let child_ids: Vec<FormNodeId> = node.children.clone();
        let mut new_node = node;
        // Children are re-attached below as fresh clones.
        new_node.children = Vec::new();
        let mut new_meta = meta;
        new_meta.xfa_id = None;
        let new_id = tree.add_node_with_meta(new_node, new_meta);
        for &child_id in &child_ids {
            let cloned_child = clone_subtree(tree, child_id);
            tree.get_mut(new_id).children.push(cloned_child);
        }
        new_id
    }

    /// Returns the text of the first element child of the field's first
    /// `<value>` child, if both exist and the inner element has text.
    fn extract_field_value(xml_field: roxmltree::Node<'_, '_>) -> Option<String> {
        let value_el = xml_field
            .children()
            .find(|c| c.is_element() && c.tag_name().name() == "value")?;
        let inner = value_el.children().find(|c| c.is_element())?;
        inner.text().map(|t| t.to_string())
    }

    /// Decides whether tree node `fid` can stand for a form-DOM element with
    /// the given tag and name. A `pageSet` matches any `PageSet` node
    /// regardless of name; every other pairing also requires equal names.
    fn child_matches(tree: &FormTree, fid: FormNodeId, xml_tag: &str, xml_name: &str) -> bool {
        use xfa_layout_engine::form::FormNodeType;
        let node = tree.get(fid);
        match (xml_tag, &node.node_type) {
            ("pageSet", FormNodeType::PageSet) => true,
            ("pageArea", FormNodeType::PageArea { .. }) => node.name == xml_name,
            ("subform", FormNodeType::Subform | FormNodeType::Area | FormNodeType::ExclGroup) => {
                node.name == xml_name
            }
            ("field", FormNodeType::Field { .. }) => node.name == xml_name,
            ("draw", FormNodeType::Draw(_) | FormNodeType::Image { .. }) => node.name == xml_name,
            _ => false,
        }
    }

    /// Applies the form-DOM element `xml_node` to tree node `form_node_id`
    /// and recurses into matched children. Returns how many unmatched
    /// subforms were admitted (left visible) in this subtree.
    fn apply_recursive(
        tree: &mut FormTree,
        form_node_id: FormNodeId,
        xml_node: roxmltree::Node<'_, '_>,
        policy: XfaRenderingPolicy,
        admit_databound_override: bool,
    ) -> usize {
        let mut admitted: usize = 0;
        let xml_tag = xml_node.tag_name().name();
        // Only these element kinds are processed; anything else (including
        // `draw`) is left as the template defined it.
        if !matches!(
            xml_tag,
            "subform" | "field" | "form" | "pageSet" | "pageArea"
        ) {
            return 0;
        }

        if xml_tag == "subform" || xml_tag == "field" || xml_tag == "pageArea" {
            // Only an explicit `hidden` is copied over; other presence values
            // are ignored here.
            if let Some(pres) = xml_node.attribute("presence") {
                if pres == "hidden" {
                    // Opt-in provenance trace on stderr.
                    if std::env::var("XFA_PRESENCE_PROV").ok().as_deref() == Some("1") {
                        eprintln!(
                            "XFA_PRESENCE_PROV site=formdom_explicit id={} name={:?} tag={}",
                            form_node_id.0,
                            tree.get(form_node_id).name,
                            xml_tag
                        );
                    }
                    tree.meta_mut(form_node_id).presence = Presence::Hidden;
                }
            }
        }

        if xml_tag == "field" {
            // The form-DOM value only fills in a field whose current value is
            // empty; an existing value is never overwritten.
            if let Some(val) = extract_field_value(xml_node) {
                if let FormNodeType::Field { ref value, .. } = tree.get(form_node_id).node_type {
                    if value.is_empty() {
                        tree.get_mut(form_node_id).node_type = FormNodeType::Field { value: val };
                    }
                }
            }
            // Fields are leaves for this walk.
            return 0;
        }

        // Element children that take part in matching.
        let xml_children: Vec<roxmltree::Node<'_, '_>> = xml_node
            .children()
            .filter(|c| {
                c.is_element()
                    && matches!(
                        c.tag_name().name(),
                        "subform" | "field" | "draw" | "pageSet" | "pageArea"
                    )
            })
            .collect();

        let inside_page_set = xml_tag == "pageSet";
        // True only when this is a pageSet whose pageArea children all share
        // exactly one distinct name (a missing name counts as ""). It is
        // false when there are no pageArea children at all.
        let uniform_page_area_template = if inside_page_set {
            let mut names: Vec<&str> = xml_children
                .iter()
                .filter(|c| c.tag_name().name() == "pageArea")
                .map(|c| c.attribute("name").unwrap_or(""))
                .collect();
            names.sort_unstable();
            names.dedup();
            names.len() == 1
        } else {
            false
        };

        // Group *consecutive* children that share (tag, name). Non-adjacent
        // repeats of the same key form separate groups.
        let mut xml_groups: Vec<((&str, &str), Vec<roxmltree::Node<'_, '_>>)> = Vec::new();
        for &xc in &xml_children {
            let xtag = xc.tag_name().name();
            let xname = xc.attribute("name").unwrap_or("");
            let key = (xtag, xname);
            if let Some(last) = xml_groups.last_mut() {
                if last.0 == key {
                    last.1.push(xc);
                    continue;
                }
            }
            xml_groups.push((key, vec![xc]));
        }

        // Working copy of the child list; `used[i]` records that
        // `form_children[i]` has been paired with a form-DOM element. The two
        // vectors are kept index-aligned when clones are inserted.
        let mut form_children = tree.get(form_node_id).children.clone();
        let mut used = vec![false; form_children.len()];

        for (gkey, group_xml_nodes) in &xml_groups {
            let (gtag, gname) = *gkey;
            let xml_count = group_xml_nodes.len();

            // Not-yet-used tree children that could stand for this group.
            let existing: Vec<(usize, FormNodeId)> = form_children
                .iter()
                .enumerate()
                .filter(|(i, &fid)| !used[*i] && child_matches(tree, fid, gtag, gname))
                .map(|(i, &fid)| (i, fid))
                .collect();
            let existing_count = existing.len();

            // pageArea instances are only multiplied inside a pageSet with a
            // single, uniform pageArea name; other tags may always expand.
            let expansion_allowed = if gtag == "pageArea" {
                inside_page_set && uniform_page_area_template
            } else {
                true
            };

            if expansion_allowed && xml_count > existing_count && existing_count > 0 {
                // The form DOM has more instances than the tree: clone the
                // first existing match and insert the clones right after the
                // last existing match.
                let template_id = existing[0].1;
                let last_existing_idx = existing.last().expect("existing_count > 0").0;
                let insert_pos = last_existing_idx + 1;
                let clones_needed = xml_count - existing_count;
                let mut new_ids = Vec::new();
                for _ in 0..clones_needed {
                    let cloned = clone_subtree(tree, template_id);
                    new_ids.push(cloned);
                }
                for (offset, new_id) in new_ids.iter().enumerate() {
                    form_children.insert(insert_pos + offset, *new_id);
                    used.insert(insert_pos + offset, false);
                }
                tree.get_mut(form_node_id).children = form_children.clone();
            }

            if gtag == "pageArea" && expansion_allowed && xml_count > existing_count {
                // Flag every matching pageArea (originals and clones alike)
                // as a runtime-instantiated page. This scan does not consult
                // `used`.
                let to_mark: Vec<FormNodeId> = form_children
                    .iter()
                    .copied()
                    .filter(|&fid| child_matches(tree, fid, gtag, gname))
                    .collect();
                for fid in to_mark {
                    tree.meta_mut(fid).runtime_instantiated_page = true;
                }
            }

            for (group_idx, &xc) in group_xml_nodes.iter().enumerate() {
                // For the second and later instances of a group, start the
                // search just past the last already-used matching child so
                // that instances are paired in document order.
                let matched = form_children
                    .iter()
                    .enumerate()
                    .skip(if group_idx > 0 {
                        form_children
                            .iter()
                            .enumerate()
                            .rfind(|(i, &fid)| used[*i] && child_matches(tree, fid, gtag, gname))
                            .map(|(i, _)| i + 1)
                            .unwrap_or(0)
                    } else {
                        0
                    })
                    .find(|(i, &fid)| !used[*i] && child_matches(tree, fid, gtag, gname));
                // A form-DOM element without a tree counterpart is skipped.
                if let Some((idx, &fid)) = matched {
                    used[idx] = true;
                    admitted += apply_recursive(tree, fid, xc, policy, admit_databound_override);
                }
            }
        }

        // Unmatched tree subforms are only reconsidered when the form DOM
        // lists at least one subform at this level.
        let has_subform_children = xml_children
            .iter()
            .any(|c| c.tag_name().name() == "subform");
        if has_subform_children {
            for (i, &fid) in form_children.iter().enumerate() {
                if used[i] {
                    continue;
                }
                let child_node = tree.get(fid);
                // Only named `Subform` nodes are candidates for suppression.
                if matches!(child_node.node_type, FormNodeType::Subform)
                    && !child_node.name.is_empty()
                {
                    let meta = tree.meta(fid);
                    // An unmatched subform stays visible only if admission is
                    // enabled (fresh-merge policy or explicit override) and
                    // the node is not hidden/inactive, is not a zero-instance
                    // prototype, has a bound data node and is not marked
                    // `data_bind_none`.
                    let admit_unmatched_databound = (policy
                        == XfaRenderingPolicy::FreshMergeExperimental
                        || admit_databound_override)
                        && !matches!(meta.presence, Presence::Hidden | Presence::Inactive)
                        && !meta.is_zero_instance_prototype
                        && meta.bound_data_node.is_some()
                        && !meta.data_bind_none;

                    if std::env::var("XFA_PRESENCE_PROV").ok().as_deref() == Some("1") {
                        let site = if admit_unmatched_databound {
                            if policy == XfaRenderingPolicy::FreshMergeExperimental {
                                "formdom_unmatched_fresh_merge_admitted"
                            } else {
                                "formdom_unmatched_databound_admitted"
                            }
                        } else {
                            "formdom_unmatched"
                        };
                        eprintln!(
                            "XFA_PRESENCE_PROV site={site} id={} name={:?}",
                            fid.0, child_node.name
                        );
                    }

                    if admit_unmatched_databound {
                        admitted += 1;
                    } else {
                        // Copy what the log needs before mutating the tree.
                        let suppressed_name = child_node.name.clone();
                        let suppressed_id = fid.0;
                        tree.meta_mut(fid).presence = Presence::Hidden;
                        crate::flatten::FORM_DOM_MATCH_LOG.with(|cell| {
                            if let Some(ref mut log) = *cell.borrow_mut() {
                                // The log is capped at 200 entries.
                                if log.len() < 200 {
                                    log.push(crate::dynamic::FormDomMatchEntry {
                                        template_node_id: suppressed_id,
                                        template_node_name: suppressed_name,
                                        reason: "formdom_unmatched_suppressed".to_string(),
                                    });
                                }
                            }
                        });
                    }
                }
            }
        }
        admitted
    }

    // The walk starts at the first `subform` child of the document element.
    let form_root = doc.root_element();
    let form_root_subform = form_root
        .children()
        .find(|c| c.is_element() && c.tag_name().name() == "subform");

    let diag_on = runtime_diag_enabled() || crate::flatten_trace::enabled();
    if diag_on {
        // Install a fresh log; `apply_recursive` appends to it.
        FORM_DOM_MATCH_LOG.with(|cell| {
            *cell.borrow_mut() = Some(Vec::new());
        });
    }

    let mut total_admitted: usize = 0;
    if let Some(xml_root_sf) = form_root_subform {
        let root_children = tree.get(root_id).children.clone();
        let root_name = xml_root_sf.attribute("name").unwrap_or("");
        // Pair the form-DOM root subform with the first child of `root_id`
        // that carries the same name (node type is not checked here).
        for &child_id in &root_children {
            if tree.get(child_id).name == root_name {
                total_admitted += apply_recursive(
                    tree,
                    child_id,
                    xml_root_sf,
                    policy,
                    admit_databound_override,
                );
                break;
            }
        }
    }

    // Take the log back out so the thread-local returns to `None`.
    let match_log = if diag_on {
        FORM_DOM_MATCH_LOG.with(|cell| cell.borrow_mut().take().unwrap_or_default())
    } else {
        Vec::new()
    };
    let match_failures = match_log.len();
    (total_admitted, match_failures, match_log)
}
3266
3267fn is_corrupt_xfa_template(pdf_size: usize, template_xml: &str) -> bool {
3271 if pdf_size >= 1024 {
3273 return false;
3274 }
3275 match roxmltree::Document::parse(template_xml) {
3277 Ok(doc) => {
3278 let root = doc.root_element();
3279 !root.children().any(|c| {
3280 c.is_element()
3281 && matches!(c.tag_name().name(), "subform" | "pageSet" | "subformSet")
3282 })
3283 }
3284 Err(_) => true, }
3286}
3287
3288fn strip_undefined_xml_entities(xml: &str) -> String {
3305 let predefined = ["lt", "gt", "amp", "quot", "apos"];
3306 let mut result = String::with_capacity(xml.len());
3307 let bytes = xml.as_bytes();
3308 let mut pos = 0;
3309
3310 while let Some(rel_amp_pos) = xml[pos..].find('&') {
3311 let amp_pos = pos + rel_amp_pos;
3312 result.push_str(&xml[pos..amp_pos]);
3313
3314 if let Some((entity_name, next_pos)) = parse_xml_entity_reference(xml, amp_pos) {
3315 if entity_name.starts_with('#') || predefined.contains(&entity_name) {
3319 result.push_str(&xml[amp_pos..next_pos]);
3320 }
3321 pos = next_pos;
3322 } else {
3323 result.push('&');
3325 pos = amp_pos + 1;
3326 }
3327 }
3328
3329 if pos < bytes.len() {
3330 result.push_str(&xml[pos..]);
3331 }
3332 result
3333}
3334
3335fn parse_xml_entity_reference(xml: &str, amp_pos: usize) -> Option<(&str, usize)> {
3336 let bytes = xml.as_bytes();
3337 let start = amp_pos + 1;
3338 let first = *bytes.get(start)?;
3339
3340 if first == b'#' {
3342 let mut idx = start + 1;
3343 if matches!(bytes.get(idx), Some(b'x' | b'X')) {
3344 idx += 1;
3345 let hex_start = idx;
3346 while matches!(
3347 bytes.get(idx),
3348 Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
3349 ) {
3350 idx += 1;
3351 }
3352 if idx == hex_start || !matches!(bytes.get(idx), Some(b';')) {
3353 return None;
3354 }
3355 } else {
3356 let digits_start = idx;
3357 while matches!(bytes.get(idx), Some(b'0'..=b'9')) {
3358 idx += 1;
3359 }
3360 if idx == digits_start || !matches!(bytes.get(idx), Some(b';')) {
3361 return None;
3362 }
3363 }
3364 return Some((&xml[start..idx], idx + 1));
3365 }
3366
3367 if !is_xml_name_start(first) {
3370 return None;
3371 }
3372
3373 let mut idx = start + 1;
3374 while let Some(&b) = bytes.get(idx) {
3375 if b == b';' {
3376 return Some((&xml[start..idx], idx + 1));
3377 }
3378 if !is_xml_name_char(b) {
3379 return None;
3380 }
3381 idx += 1;
3382 }
3383 None
3384}
3385
/// Returns `true` for the ASCII bytes accepted as the first byte of an
/// entity name here: letters, `:` and `_`.
fn is_xml_name_start(byte: u8) -> bool {
    byte.is_ascii_alphabetic() || byte == b':' || byte == b'_'
}
3389
3390fn is_xml_name_char(byte: u8) -> bool {
3391 is_xml_name_start(byte) || matches!(byte, b'-' | b'.' | b'0'..=b'9')
3392}
3393
3394fn pages_have_static_content(doc: &Document) -> bool {
3407 for page_id in doc.page_iter() {
3408 let streams = page_content_streams(doc, page_id);
3409 if streams.is_empty() {
3410 continue;
3411 }
3412
3413 let mut text_op_count = 0usize;
3419 for stream in &streams {
3420 if is_xfa_placeholder_stream(stream) || is_watermark_stream(stream) {
3421 continue;
3422 }
3423 text_op_count += count_text_operators(stream);
3424 }
3425
3426 if text_op_count >= 5 {
3427 return true;
3428 }
3429 }
3430 false
3431}
3432
3433fn page_content_streams(doc: &Document, page_id: ObjectId) -> Vec<Vec<u8>> {
3434 let Ok(page_dict) = doc.get_dictionary(page_id) else {
3435 return Vec::new();
3436 };
3437
3438 match page_dict.get(b"Contents") {
3439 Ok(Object::Array(arr)) => arr
3440 .iter()
3441 .filter_map(|object| resolve_stream_content(doc, object))
3442 .collect(),
3443 Ok(Object::Reference(id)) => match doc.get_object(*id) {
3444 Ok(Object::Array(arr)) => arr
3445 .iter()
3446 .filter_map(|object| resolve_stream_content(doc, object))
3447 .collect(),
3448 Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
3449 Err(_) => Vec::new(),
3450 },
3451 Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
3452 Err(_) => Vec::new(),
3453 }
3454}
3455
3456fn resolve_stream_content(doc: &Document, object: &Object) -> Option<Vec<u8>> {
3457 let stream = match object {
3458 Object::Reference(id) => doc.get_object(*id).ok()?.as_stream().ok()?,
3459 Object::Stream(stream) => stream,
3460 _ => return None,
3461 };
3462
3463 stream
3464 .get_plain_content()
3465 .ok()
3466 .or_else(|| Some(stream.content.clone()))
3467}
3468
/// Counts occurrences of the text-showing operators `Tj` / `TJ` in a raw
/// content stream.
///
/// An occurrence is only counted when the operator is immediately preceded
/// by a space, `)` or `]`; the byte following the operator is not inspected.
fn count_text_operators(stream: &[u8]) -> usize {
    stream
        .windows(3)
        .filter(|window| matches!(*window, [b' ' | b')' | b']', b'T', b'j' | b'J']))
        .count()
}
3482
/// Paints the "on" appearance of state-based widgets (those whose `/AP /N`
/// entry is a dictionary of named states) directly into the page content.
///
/// For every widget annotation on the page that is not in the `Off` state,
/// the first non-`Off` state whose value is an indirect reference to a
/// stream is registered as an XObject on the page and drawn at the
/// annotation's lower-left `/Rect` corner. The annotations themselves are
/// left in place. Returns the number of appearances drawn.
fn bake_checkbox_radio_ap_marks(doc: &mut Document, page_id: ObjectId) -> usize {
    let annots = page_annotations(doc, page_id);
    if annots.is_empty() {
        return 0;
    }

    let mut baked = 0usize;
    let mut overlay_ops = Vec::new();

    for annot in &annots {
        // Only indirect annotation entries are considered.
        let Some(annot_id) = annot.as_reference().ok() else {
            continue;
        };
        // Clone the dictionary so `doc` can be mutated later in the loop.
        let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
            continue;
        };

        let is_widget = annot_dict
            .get(b"Subtype")
            .ok()
            .and_then(|obj| obj.as_name().ok())
            == Some(&b"Widget"[..]);
        if !is_widget {
            continue;
        }

        // `/AP` must be a direct dictionary with an `/N` entry.
        let ap = match annot_dict.get(b"AP").ok().and_then(|o| o.as_dict().ok()) {
            Some(ap) => ap.clone(),
            None => continue,
        };
        let normal_obj = match ap.get(b"N").ok() {
            Some(obj) => obj.clone(),
            None => continue,
        };

        // Only state dictionaries qualify; an `/N` that is (or refers to)
        // anything else, such as a single stream, is skipped.
        let states: Dictionary = match &normal_obj {
            Object::Reference(id) => match doc.get_object(*id).ok().cloned() {
                Some(Object::Dictionary(d)) => d,
                _ => continue,
            },
            Object::Dictionary(d) => d.clone(),
            _ => continue,
        };

        // Widgets whose selected state (`/AS`, else `/V`) is `Off` draw nothing.
        if matches!(selected_widget_state(&annot_dict), Some(state) if state == b"Off") {
            continue;
        }

        // NOTE(review): this takes the first non-`Off` entry that is an
        // indirect reference, in dictionary iteration order, without
        // comparing it to the selected state name.
        let on_id = states
            .iter()
            .filter(|(name, _)| name.as_slice() != b"Off")
            .find_map(|(_, obj)| match obj {
                Object::Reference(id) => Some(*id),
                _ => None,
            });
        let Some(ap_id) = on_id else { continue };

        // The referenced object must actually be a stream.
        match doc.get_object(ap_id).ok() {
            Some(Object::Stream(_)) => {}
            _ => continue,
        }

        let Some(rect) = annotation_rect(&annot_dict) else {
            continue;
        };

        // The name index restarts at 0 on every call, i.e. per page.
        let xobject_name = format!("XfaCbAp{}", baked);
        add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
        // Translate to the rectangle's first corner and paint the XObject.
        write_ops(
            &mut overlay_ops,
            format_args!(
                "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
                rect[0], rect[1], xobject_name
            ),
        );
        baked += 1;
    }

    if !overlay_ops.is_empty() {
        append_to_page_content(doc, page_id, &overlay_ops);
    }

    baked
}
3582
3583fn is_xfa_placeholder_stream(stream: &[u8]) -> bool {
3584 const PLACEHOLDER_MARKERS: [&[u8]; 5] = [
3585 b"Please wait",
3586 b"Adobe Reader",
3587 b"reader_download",
3588 b"display this type of document",
3589 b"To view the full contents",
3590 ];
3591
3592 PLACEHOLDER_MARKERS
3593 .iter()
3594 .any(|marker| contains_ascii_case_insensitive(stream, marker))
3595}
3596
3597fn is_watermark_stream(stream: &[u8]) -> bool {
3601 const WATERMARK_MARKERS: [&[u8]; 3] =
3602 [b"Evaluation Only", b"Qoppa Software", b"For Evaluation"];
3603 WATERMARK_MARKERS
3604 .iter()
3605 .any(|marker| contains_ascii_case_insensitive(stream, marker))
3606}
3607
/// Returns `true` when `needle` occurs in `haystack`, comparing bytes
/// ASCII-case-insensitively.
///
/// An empty `needle` is contained in every haystack. It is handled up front
/// because `slice::windows` panics when asked for windows of size zero.
fn contains_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    haystack
        .windows(needle.len())
        .any(|window| window.eq_ignore_ascii_case(needle))
}
3613
/// Formats `args` and appends the resulting UTF-8 bytes to `buf`.
///
/// A formatting error is ignored; whatever was produced up to that point is
/// still appended.
fn write_ops(buf: &mut Vec<u8>, args: std::fmt::Arguments<'_>) {
    let mut rendered = String::new();
    let _ = std::fmt::write(&mut rendered, args);
    buf.extend(rendered.bytes());
}
3621
3622fn flatten_widget_appearances(doc: &mut Document) -> usize {
3630 let page_ids: Vec<ObjectId> = doc.page_iter().collect();
3631 let mut flattened = 0usize;
3632
3633 for page_id in page_ids {
3634 let annots = page_annotations(doc, page_id);
3635 if annots.is_empty() {
3636 continue;
3637 }
3638
3639 let mut retained = Vec::new();
3640 let mut overlay_ops = Vec::new();
3641
3642 for annot in annots {
3643 let Some(annot_id) = annot.as_reference().ok() else {
3644 retained.push(annot);
3645 continue;
3646 };
3647
3648 let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
3649 retained.push(annot);
3650 continue;
3651 };
3652
3653 let is_widget = annot_dict
3654 .get(b"Subtype")
3655 .ok()
3656 .and_then(|obj| obj.as_name().ok())
3657 == Some(&b"Widget"[..]);
3658 if !is_widget {
3659 retained.push(annot);
3660 continue;
3661 }
3662
3663 let Some(rect) = annotation_rect(&annot_dict) else {
3664 retained.push(Object::Reference(annot_id));
3665 continue;
3666 };
3667 let Some(ap_id) = resolve_widget_normal_appearance(doc, &annot_dict) else {
3668 retained.push(Object::Reference(annot_id));
3669 continue;
3670 };
3671
3672 let xobject_name = format!("XfaAp{}", flattened);
3673 add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
3674 write_ops(
3675 &mut overlay_ops,
3676 format_args!(
3677 "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
3678 rect[0], rect[1], xobject_name
3679 ),
3680 );
3681 flattened += 1;
3682 }
3683
3684 if overlay_ops.is_empty() {
3685 continue;
3686 }
3687
3688 append_to_page_content(doc, page_id, &overlay_ops);
3689 set_page_annotations(doc, page_id, retained);
3690 }
3691
3692 flattened
3693}
3694
3695fn strip_widget_annotations(doc: &mut Document, page_id: ObjectId) {
3697 let annots = page_annotations(doc, page_id);
3698 if annots.is_empty() {
3699 return;
3700 }
3701 let mut retained = Vec::new();
3702 for annot in &annots {
3703 let is_widget = annot
3704 .as_reference()
3705 .ok()
3706 .and_then(|id| doc.get_dictionary(id).ok())
3707 .and_then(|d| d.get(b"Subtype").ok())
3708 .and_then(|obj| obj.as_name().ok())
3709 == Some(&b"Widget"[..]);
3710 if !is_widget {
3711 retained.push(annot.clone());
3712 }
3713 }
3714 set_page_annotations(doc, page_id, retained);
3715}
3716
3717fn page_annotations(doc: &Document, page_id: ObjectId) -> Vec<Object> {
3718 let Ok(page_dict) = doc.get_dictionary(page_id) else {
3719 return Vec::new();
3720 };
3721
3722 match page_dict.get(b"Annots") {
3723 Ok(Object::Array(arr)) => arr.clone(),
3724 Ok(Object::Reference(id)) => doc
3725 .get_object(*id)
3726 .ok()
3727 .and_then(|obj| obj.as_array().ok().cloned())
3728 .unwrap_or_default(),
3729 _ => Vec::new(),
3730 }
3731}
3732
3733fn set_page_annotations(doc: &mut Document, page_id: ObjectId, annots: Vec<Object>) {
3734 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3735 if annots.is_empty() {
3736 page_dict.remove(b"Annots");
3737 } else {
3738 page_dict.set("Annots", Object::Array(annots));
3739 }
3740 }
3741}
3742
3743fn annotation_rect(dict: &Dictionary) -> Option<[f32; 4]> {
3744 let rect = dict.get(b"Rect").ok()?.as_array().ok()?;
3745 if rect.len() != 4 {
3746 return None;
3747 }
3748 Some([
3749 rect[0].as_float().ok()?,
3750 rect[1].as_float().ok()?,
3751 rect[2].as_float().ok()?,
3752 rect[3].as_float().ok()?,
3753 ])
3754}
3755
3756fn resolve_widget_normal_appearance(
3757 doc: &mut Document,
3758 annot_dict: &Dictionary,
3759) -> Option<ObjectId> {
3760 let ap = annot_dict.get(b"AP").ok()?.as_dict().ok()?;
3761 let normal = ap.get(b"N").ok()?;
3762 resolve_appearance_object(doc, annot_dict, normal)
3763}
3764
3765fn resolve_appearance_object(
3766 doc: &mut Document,
3767 annot_dict: &Dictionary,
3768 object: &Object,
3769) -> Option<ObjectId> {
3770 match object {
3771 Object::Reference(id) => match doc.get_object(*id).ok()?.clone() {
3772 Object::Stream(_) => Some(*id),
3773 Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, &states),
3774 _ => None,
3775 },
3776 Object::Stream(stream) => Some(doc.add_object(Object::Stream(stream.clone()))),
3777 Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, states),
3778 _ => None,
3779 }
3780}
3781
3782fn resolve_appearance_state(
3783 doc: &mut Document,
3784 annot_dict: &Dictionary,
3785 states: &Dictionary,
3786) -> Option<ObjectId> {
3787 if let Some(state) = selected_widget_state(annot_dict) {
3788 if let Ok(object) = states.get(state) {
3789 if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3790 return Some(id);
3791 }
3792 }
3793 if state == b"Off" {
3794 return None;
3797 }
3798 }
3799
3800 for fallback in [b"Yes".as_slice(), b"On".as_slice(), b"Off".as_slice()] {
3801 if let Ok(object) = states.get(fallback) {
3802 if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3803 return Some(id);
3804 }
3805 }
3806 }
3807
3808 for (_name, object) in states.iter() {
3809 if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
3810 return Some(id);
3811 }
3812 }
3813
3814 None
3815}
3816
3817fn selected_widget_state(annot_dict: &Dictionary) -> Option<&[u8]> {
3818 annot_dict
3819 .get(b"AS")
3820 .ok()
3821 .and_then(|obj| obj.as_name().ok())
3822 .or_else(|| annot_dict.get(b"V").ok().and_then(|obj| obj.as_name().ok()))
3823}
3824
/// Registers `xobject_id` under `name` in the page's `/Resources /XObject`
/// dictionary, creating whichever dictionaries are missing.
///
/// `/Resources` and `/XObject` may each be a direct dictionary or an
/// indirect reference; the cases are tried in this order:
/// 1. `/Resources` is a reference and its `/XObject` is a reference,
/// 2. `/Resources` is a reference (`/XObject` direct or absent),
/// 3. `/Resources` is direct and its `/XObject` is a reference,
/// 4. `/Resources` is direct (`/XObject` direct or absent),
/// 5. the page has no usable `/Resources`: a new one is created.
///
/// NOTE(review): only the page dictionary itself is consulted. Case 5 sets a
/// fresh `/Resources` on the page; whether that can shadow resources
/// inherited from a parent node depends on callers. TODO confirm.
fn add_xobject_to_page_resources(
    doc: &mut Document,
    page_id: ObjectId,
    name: &str,
    xobject_id: ObjectId,
) {
    // Lookups are done with shared borrows first and only ids are kept, so
    // that `doc` can be borrowed mutably for the actual update.
    let resources_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_reference().ok())
    });

    if let Some(resources_id) = resources_ref {
        let xobject_ref = doc.get_dictionary(resources_id).ok().and_then(|resources| {
            resources
                .get(b"XObject")
                .ok()
                .and_then(|obj| obj.as_reference().ok())
        });

        // Case 1: both indirect.
        if let Some(xobject_dict_id) = xobject_ref {
            if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
                xobjects.set(name, Object::Reference(xobject_id));
                return;
            }
        }

        // Case 2: update (or create) `/XObject` inside the referenced
        // resources dictionary.
        if let Ok(Object::Dictionary(ref mut resources)) = doc.get_object_mut(resources_id) {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }
    }

    // Case 3: direct `/Resources` whose `/XObject` is a reference.
    let inline_xobject_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_dict().ok())
            .and_then(|resources| {
                resources
                    .get(b"XObject")
                    .ok()
                    .and_then(|obj| obj.as_reference().ok())
            })
    });

    if let Some(xobject_dict_id) = inline_xobject_ref {
        if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
            xobjects.set(name, Object::Reference(xobject_id));
            return;
        }
    }

    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        // Case 4: direct `/Resources` dictionary on the page.
        if let Ok(Object::Dictionary(ref mut resources)) = page_dict.get_mut(b"Resources") {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }

        // Case 5: build a new `/Resources` holding just this XObject.
        let mut resources = Dictionary::new();
        add_xobject_to_resources_dict(&mut resources, name, xobject_id);
        page_dict.set("Resources", Object::Dictionary(resources));
    }
}
3890
3891fn add_xobject_to_resources_dict(resources: &mut Dictionary, name: &str, xobject_id: ObjectId) {
3892 if let Ok(Object::Dictionary(ref mut xobjects)) = resources.get_mut(b"XObject") {
3893 xobjects.set(name, Object::Reference(xobject_id));
3894 } else {
3895 let mut xobjects = Dictionary::new();
3896 xobjects.set(name, Object::Reference(xobject_id));
3897 resources.set("XObject", Object::Dictionary(xobjects));
3898 }
3899}
3900
3901fn append_to_page_content(doc: &mut Document, page_id: ObjectId, data: &[u8]) {
3902 let new_stream_id = doc.add_object(Object::Stream(Stream::new(dictionary! {}, data.to_vec())));
3903
3904 let contents = doc
3905 .get_dictionary(page_id)
3906 .ok()
3907 .and_then(|page_dict| page_dict.get(b"Contents").ok().cloned());
3908
3909 let new_contents = match contents {
3916 Some(existing) => {
3917 let mut flattened = Vec::new();
3918 flatten_page_contents_entries(doc, existing, &mut flattened);
3919 flattened.push(Object::Reference(new_stream_id));
3920 if flattened.len() == 1 {
3921 flattened.pop().expect("flattened.len() == 1")
3923 } else {
3924 Object::Array(flattened)
3925 }
3926 }
3927 None => Object::Reference(new_stream_id),
3928 };
3929
3930 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3931 page_dict.set("Contents", new_contents);
3932 }
3933}
3934
3935fn flatten_page_contents_entries(doc: &mut Document, object: Object, out: &mut Vec<Object>) {
3936 match object {
3937 Object::Reference(id) => match doc.get_object(id).cloned() {
3938 Ok(Object::Array(items)) => {
3939 for item in items {
3940 flatten_page_contents_entries(doc, item, out);
3941 }
3942 }
3943 _ => out.push(Object::Reference(id)),
3944 },
3945 Object::Array(items) => {
3946 for item in items {
3947 flatten_page_contents_entries(doc, item, out);
3948 }
3949 }
3950 Object::Stream(stream) => {
3951 let stream_id = doc.add_object(Object::Stream(stream));
3952 out.push(Object::Reference(stream_id));
3953 }
3954 other => out.push(other),
3955 }
3956}
3957
/// Removes the document's AcroForm by delegating to `remove_acroform`.
///
/// NOTE(review): despite the name, nothing in this body touches widget
/// annotations directly; whether they are removed depends on what
/// `remove_acroform` does — confirm against that helper.
fn strip_widgets_and_acroform(doc: &mut Document) {
    remove_acroform(doc);
}
3966
3967fn write_page_content(
3969 doc: &mut Document,
3970 page_id: ObjectId,
3971 overlay: &PageOverlay,
3972 font_ids: &[ObjectId; 3],
3973 embedded_fonts: &[(String, ObjectId)],
3974 page_width: Option<f64>,
3975 page_height: Option<f64>,
3976) -> Result<()> {
3977 let mut resources = make_resources_dict(font_ids, embedded_fonts);
3978
3979 let mut xobjects = Dictionary::new();
3980 for img in &overlay.images {
3981 match embed_image(doc, &img.data, &img.mime_type) {
3982 Ok(result) => {
3983 xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
3984 }
3985 Err(e) => {
3986 eprintln!("failed to embed image {}: {}", img.name, e);
3987 }
3988 }
3989 }
3990 if !xobjects.is_empty() {
3991 resources.set("XObject", Object::Dictionary(xobjects));
3992 }
3993
3994 let stream = Stream::new(
3995 dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
3996 overlay.content_stream.clone(),
3997 );
3998 let stream_id = doc.add_object(Object::Stream(stream));
3999
4000 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
4001 page_dict.set("Contents", Object::Reference(stream_id));
4002 page_dict.set("Resources", Object::Dictionary(resources));
4003 if let (Some(w), Some(h)) = (page_width, page_height) {
4007 page_dict.set(
4008 "MediaBox",
4009 Object::Array(vec![
4010 Object::Real(0.0),
4011 Object::Real(0.0),
4012 Object::Real(w as f32),
4013 Object::Real(h as f32),
4014 ]),
4015 );
4016 }
4017 }
4018 Ok(())
4019}
4020
4021fn overlay_page_content(
4027 doc: &mut Document,
4028 page_id: ObjectId,
4029 overlay: &PageOverlay,
4030 font_ids: &[ObjectId; 3],
4031 embedded_fonts: &[(String, ObjectId)],
4032) -> Result<()> {
4033 let xfa_resources = make_resources_dict(font_ids, embedded_fonts);
4034
4035 let mut xfa_xobjects = Dictionary::new();
4036 for img in &overlay.images {
4037 match embed_image(doc, &img.data, &img.mime_type) {
4038 Ok(result) => {
4039 xfa_xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
4040 }
4041 Err(e) => {
4042 eprintln!("failed to embed image {}: {}", img.name, e);
4043 }
4044 }
4045 }
4046
4047 merge_xfa_resources_into_page(doc, page_id, &xfa_resources, &xfa_xobjects);
4048
4049 if !overlay.content_stream.is_empty() {
4050 append_to_page_content(doc, page_id, &overlay.content_stream);
4051 }
4052
4053 Ok(())
4054}
4055
4056fn merge_xfa_resources_into_page(
4059 doc: &mut Document,
4060 page_id: ObjectId,
4061 xfa_resources: &Dictionary,
4062 xfa_xobjects: &Dictionary,
4063) {
4064 let existing_resources = doc
4065 .get_dictionary(page_id)
4066 .ok()
4067 .and_then(|page_dict| {
4068 page_dict.get(b"Resources").ok().and_then(|obj| match obj {
4069 Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4070 Object::Dictionary(d) => Some(d.clone()),
4071 _ => None,
4072 })
4073 })
4074 .unwrap_or_default();
4075
4076 let mut merged = existing_resources;
4077
4078 if let Ok(xfa_font_dict) = xfa_resources.get(b"Font").and_then(|o| o.as_dict()) {
4081 let existing_font = merged
4082 .get(b"Font")
4083 .ok()
4084 .and_then(|obj| match obj {
4085 Object::Dictionary(d) => Some(d.clone()),
4086 Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4087 _ => None,
4088 })
4089 .unwrap_or_default();
4090
4091 let mut font_merged = existing_font;
4092 for (key, val) in xfa_font_dict.iter() {
4093 if font_merged.get(key).is_err() {
4094 font_merged.set(key.clone(), val.clone());
4095 }
4096 }
4097 merged.set("Font", Object::Dictionary(font_merged));
4098 }
4099
4100 if !xfa_xobjects.is_empty() {
4102 let existing_xobj = merged
4103 .get(b"XObject")
4104 .ok()
4105 .and_then(|obj| match obj {
4106 Object::Dictionary(d) => Some(d.clone()),
4107 Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
4108 _ => None,
4109 })
4110 .unwrap_or_default();
4111
4112 let mut xobj_merged = existing_xobj;
4113 for (key, val) in xfa_xobjects.iter() {
4114 xobj_merged.set(key.clone(), val.clone());
4115 }
4116 merged.set("XObject", Object::Dictionary(xobj_merged));
4117 }
4118
4119 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
4120 page_dict.set("Resources", Object::Dictionary(merged));
4121 }
4122}
4123
4124fn add_new_page(
4126 doc: &mut Document,
4127 w: f64,
4128 h: f64,
4129 overlay: &PageOverlay,
4130 font_ids: &[ObjectId; 3],
4131 embedded_fonts: &[(String, ObjectId)],
4132) -> Result<()> {
4133 let mut resources = make_resources_dict(font_ids, embedded_fonts);
4134
4135 let mut xobjects = Dictionary::new();
4136 for img in &overlay.images {
4137 match embed_image(doc, &img.data, &img.mime_type) {
4138 Ok(result) => {
4139 xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
4140 }
4141 Err(e) => {
4142 eprintln!("failed to embed image {}: {}", img.name, e);
4143 }
4144 }
4145 }
4146 if !xobjects.is_empty() {
4147 resources.set("XObject", Object::Dictionary(xobjects));
4148 }
4149
4150 let stream = Stream::new(
4151 dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
4152 overlay.content_stream.clone(),
4153 );
4154 let stream_id = doc.add_object(Object::Stream(stream));
4155
4156 let pages_id = find_pages_root(doc)?;
4158
4159 let page_id = doc.add_object(Object::Dictionary(dictionary! {
4160 "Type" => Object::Name(b"Page".to_vec()),
4161 "Parent" => Object::Reference(pages_id),
4162 "MediaBox" => Object::Array(vec![
4163 Object::Integer(0), Object::Integer(0),
4164 Object::Real(w as f32), Object::Real(h as f32),
4165 ]),
4166 "Contents" => Object::Reference(stream_id),
4167 "Resources" => Object::Dictionary(resources)
4168 }));
4169
4170 if let Ok(Object::Dictionary(ref mut pages_dict)) = doc.get_object_mut(pages_id) {
4172 if let Ok(Object::Array(ref mut kids)) = pages_dict.get_mut(b"Kids") {
4173 kids.push(Object::Reference(page_id));
4174 }
4175 if let Ok(Object::Integer(ref mut count)) = pages_dict.get_mut(b"Count") {
4176 *count += 1;
4177 }
4178 }
4179 Ok(())
4180}
4181
4182fn make_resources_dict(
4183 font_ids: &[ObjectId; 3],
4184 embedded_fonts: &[(String, ObjectId)],
4185) -> Dictionary {
4186 let mut fonts = Dictionary::new();
4187 fonts.set("F1", Object::Reference(font_ids[0]));
4188 fonts.set("F2", Object::Reference(font_ids[1]));
4189 fonts.set("F3", Object::Reference(font_ids[2]));
4190 for (name, obj_id) in embedded_fonts {
4191 fonts.set(name.as_str(), Object::Reference(*obj_id));
4192 }
4193 let mut resources = Dictionary::new();
4194 resources.set("Font", Object::Dictionary(fonts));
4195 resources
4196}
4197
4198fn find_pages_root(doc: &Document) -> Result<ObjectId> {
4199 let root_id = doc
4200 .trailer
4201 .get(b"Root")
4202 .ok()
4203 .and_then(|o: &Object| o.as_reference().ok())
4204 .ok_or_else(|| XfaError::LoadFailed("no /Root in trailer".to_string()))?;
4205 let catalog = doc
4206 .get_dictionary(root_id)
4207 .map_err(|e| XfaError::LoadFailed(format!("catalog: {e}")))?;
4208 catalog
4209 .get(b"Pages")
4210 .ok()
4211 .and_then(|o: &Object| o.as_reference().ok())
4212 .ok_or_else(|| XfaError::LoadFailed("no /Pages in catalog".to_string()))
4213}
4214
4215fn remove_acroform(doc: &mut Document) {
4229 let root_id = match doc.trailer.get(b"Root") {
4230 Ok(Object::Reference(id)) => *id,
4231 _ => return,
4232 };
4233
4234 let acroform_id: Option<ObjectId> = {
4237 if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
4238 let acroform_ref = dict.get(b"AcroForm").ok().and_then(|o| {
4239 if let Object::Reference(id) = o {
4240 Some(*id)
4241 } else {
4242 None
4243 }
4244 });
4245 dict.remove(b"AcroForm");
4246 dict.remove(b"NeedsRendering");
4247 acroform_ref
4248 } else {
4249 None
4250 }
4251 };
4252
4253 let xfa_stream_ids: Vec<ObjectId> = acroform_id
4256 .and_then(|af_id| doc.get_dictionary(af_id).ok())
4257 .map(|af_dict| match af_dict.get(b"XFA") {
4258 Ok(Object::Array(arr)) => arr
4259 .iter()
4260 .filter_map(|o| {
4261 if let Object::Reference(id) = o {
4262 Some(*id)
4263 } else {
4264 None
4265 }
4266 })
4267 .collect(),
4268 Ok(Object::Reference(id)) => vec![*id],
4269 _ => Vec::new(),
4270 })
4271 .unwrap_or_default();
4272
4273 if let Some(af_id) = acroform_id {
4274 if let Ok(Object::Dictionary(ref mut af_dict)) = doc.get_object_mut(af_id) {
4275 af_dict.remove(b"XFA");
4276 }
4277 }
4278
4279 for stream_id in xfa_stream_ids {
4283 doc.objects.remove(&stream_id);
4284 }
4285 if let Some(af_id) = acroform_id {
4286 doc.objects.remove(&af_id);
4287 }
4288
4289 let page_ids: Vec<ObjectId> = doc.page_iter().collect();
4292 for page_id in page_ids {
4293 strip_widget_annotations(doc, page_id);
4294 }
4295}
4296
/// Structural findings produced by [`validate_flattened_pdf`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FlattenValidation {
    /// `true` when no `/XFA` entry was found in the AcroForm dictionary or the catalog.
    pub has_no_xfa: bool,
    /// `true` when the catalog has no `/NeedsRendering` entry.
    pub has_no_needs_rendering: bool,
    /// `true` when the catalog has no `/AcroForm` entry.
    pub has_no_acroform: bool,
    /// Number of pages found in the document.
    pub page_count: usize,
    /// Human-readable description of every finding.
    pub warnings: Vec<String>,
}
4317
4318pub fn validate_flattened_pdf(pdf_bytes: &[u8]) -> Result<FlattenValidation> {
4326 if pdf_bytes.is_empty() {
4327 return Ok(FlattenValidation {
4328 has_no_xfa: true,
4329 has_no_needs_rendering: true,
4330 has_no_acroform: true,
4331 page_count: 0,
4332 warnings: vec!["empty input — no PDF to validate".into()],
4333 });
4334 }
4335
4336 let doc = match Document::load_mem(pdf_bytes) {
4337 Ok(d) => d,
4338 Err(e) => {
4339 return Ok(FlattenValidation {
4340 has_no_xfa: false,
4341 has_no_needs_rendering: false,
4342 has_no_acroform: false,
4343 page_count: 0,
4344 warnings: vec![format!("could not parse PDF: {e}")],
4345 });
4346 }
4347 };
4348
4349 let mut warnings = Vec::new();
4350 let mut has_no_xfa = true;
4351 let mut has_no_needs_rendering = true;
4352 let mut has_no_acroform = true;
4353
4354 let root_id = doc.trailer.get(b"Root").ok().and_then(|o| {
4356 if let Object::Reference(id) = o {
4357 Some(*id)
4358 } else {
4359 None
4360 }
4361 });
4362
4363 if let Some(rid) = root_id {
4364 if let Ok(catalog) = doc.get_dictionary(rid) {
4365 if catalog.get(b"AcroForm").is_ok() {
4366 has_no_acroform = false;
4367 warnings.push("/AcroForm still present in catalog".into());
4368
4369 let acroform_has_xfa = catalog
4371 .get(b"AcroForm")
4372 .ok()
4373 .and_then(|o| match o {
4374 Object::Reference(id) => doc.get_dictionary(*id).ok(),
4375 Object::Dictionary(d) => Some(d),
4376 _ => None,
4377 })
4378 .map(|d| d.get(b"XFA").is_ok())
4379 .unwrap_or(false);
4380
4381 if acroform_has_xfa {
4382 has_no_xfa = false;
4383 warnings.push("/XFA still present in AcroForm dictionary".into());
4384 }
4385 }
4386
4387 if catalog.get(b"NeedsRendering").is_ok() {
4388 has_no_needs_rendering = false;
4389 warnings.push("/NeedsRendering still present in catalog".into());
4390 }
4391
4392 if catalog.get(b"XFA").is_ok() {
4394 has_no_xfa = false;
4395 warnings.push("/XFA still present directly in catalog".into());
4396 }
4397 }
4398 }
4399
4400 let page_ids: Vec<ObjectId> = doc.page_iter().collect();
4402 let page_count = page_ids.len();
4403 for page_id in page_ids {
4404 for annot_obj in page_annotations(&doc, page_id) {
4405 let is_widget = annot_obj
4406 .as_reference()
4407 .ok()
4408 .and_then(|id| doc.get_dictionary(id).ok())
4409 .and_then(|d| {
4410 d.get(b"Subtype")
4411 .ok()
4412 .map(|st| st == &Object::Name(b"Widget".to_vec()))
4413 })
4414 .unwrap_or(false);
4415 if is_widget {
4416 warnings.push(format!(
4417 "widget annotation found on page (object {:?})",
4418 annot_obj
4419 ));
4420 }
4421 }
4422 }
4423
4424 Ok(FlattenValidation {
4425 has_no_xfa,
4426 has_no_needs_rendering,
4427 has_no_acroform,
4428 page_count,
4429 warnings,
4430 })
4431}
4432
/// Before/after size metrics produced by [`compare_flatten_quality`].
#[derive(Debug, Clone, PartialEq)]
pub struct FlattenQualityMetrics {
    /// Page count of the original document.
    pub page_count_before: usize,
    /// Page count of the flattened document.
    pub page_count_after: usize,
    /// `true` when both page counts are equal.
    pub page_count_match: bool,
    /// Total stored bytes of all stream objects in the original document.
    pub content_stream_bytes_before: usize,
    /// Total stored bytes of all stream objects in the flattened document.
    pub content_stream_bytes_after: usize,
    /// `content_stream_bytes_after / content_stream_bytes_before`, or `1.0` when the
    /// original total is zero.
    pub content_ratio: f64,
}
4455
4456pub fn compare_flatten_quality(
4461 original_bytes: &[u8],
4462 flattened_bytes: &[u8],
4463) -> Result<FlattenQualityMetrics> {
4464 fn count_pages_and_stream_bytes(pdf_bytes: &[u8]) -> (usize, usize) {
4465 let doc = match Document::load_mem(pdf_bytes) {
4466 Ok(d) => d,
4467 Err(_) => return (0, 0),
4468 };
4469 let page_count = doc.page_iter().count();
4470 let stream_bytes: usize = doc
4471 .objects
4472 .values()
4473 .filter_map(|obj| {
4474 if let Object::Stream(s) = obj {
4475 s.content.len().into()
4477 } else {
4478 None
4479 }
4480 })
4481 .sum();
4482 (page_count, stream_bytes)
4483 }
4484
4485 let (page_count_before, content_stream_bytes_before) =
4486 count_pages_and_stream_bytes(original_bytes);
4487 let (page_count_after, content_stream_bytes_after) =
4488 count_pages_and_stream_bytes(flattened_bytes);
4489
4490 let content_ratio = if content_stream_bytes_before == 0 {
4491 1.0_f64
4492 } else {
4493 content_stream_bytes_after as f64 / content_stream_bytes_before as f64
4494 };
4495
4496 Ok(FlattenQualityMetrics {
4497 page_count_before,
4498 page_count_after,
4499 page_count_match: page_count_before == page_count_after,
4500 content_stream_bytes_before,
4501 content_stream_bytes_after,
4502 content_ratio,
4503 })
4504}
4505
/// Result of [`validate_text_completeness`]: which expected field values appear in the
/// flattened output.
#[derive(Debug, Clone, PartialEq)]
pub struct TextValidation {
    /// Field values extracted from the XFA datasets packet.
    pub expected_values: Vec<String>,
    /// Expected values that were found in the flattened PDF's text.
    pub found_values: Vec<String>,
    /// Expected values that were not found in the flattened PDF's text.
    pub missing_values: Vec<String>,
    /// `found_values.len() / expected_values.len()`; `1.0` when nothing was expected.
    pub completeness_ratio: f64,
}
4526
/// Collects the text value of every `<field ...>...</field>` element in `datasets_xml`.
///
/// This is a lightweight textual scan, not an XML parser:
/// - only tags whose name is exactly `field` are considered (`<fieldset>`, `<fields>`, ...
///   are skipped rather than being mistaken for a field);
/// - self-closing `<field .../>` tags are skipped;
/// - a field's content runs up to the next `</field>`, so nested fields are not supported;
/// - fields whose extracted text is empty are omitted.
///
/// Scanning stops at the first field tag that is missing its `>` or its `</field>`.
fn extract_field_values_from_datasets(datasets_xml: &str) -> Vec<String> {
    const OPEN: &str = "<field";
    const CLOSE: &str = "</field>";

    let mut values = Vec::new();
    let mut remaining = datasets_xml;

    while let Some(open_pos) = remaining.find(OPEN) {
        let after_name = &remaining[open_pos + OPEN.len()..];

        // The tag name must end right after "field"; otherwise this is some other element
        // that merely starts with the same letters.
        let is_field_tag = matches!(
            after_name.chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !is_field_tag {
            remaining = after_name;
            continue;
        }

        let tag_end = match after_name.find('>') {
            Some(p) => open_pos + OPEN.len() + p + 1,
            None => break,
        };

        if remaining[open_pos..tag_end].ends_with("/>") {
            remaining = &remaining[tag_end..];
            continue;
        }

        match remaining[tag_end..].find(CLOSE) {
            Some(close_pos) => {
                let text = extract_innermost_text(&remaining[tag_end..tag_end + close_pos]);
                if !text.is_empty() {
                    values.push(text);
                }
                remaining = &remaining[tag_end + close_pos + CLOSE.len()..];
            }
            None => break,
        }
    }
    values
}

/// Returns the trimmed text carried by a field's inner XML.
///
/// Prefers the content of the first attribute-less `<text>...</text>` element when it is
/// non-empty after trimming; otherwise falls back to `inner` with all tags stripped.
fn extract_innermost_text(inner: &str) -> String {
    const TEXT_OPEN: &str = "<text>";
    const TEXT_CLOSE: &str = "</text>";

    let from_text_element = inner.find(TEXT_OPEN).and_then(|start| {
        let body = &inner[start + TEXT_OPEN.len()..];
        body.find(TEXT_CLOSE).map(|end| body[..end].trim())
    });

    match from_text_element {
        Some(text) if !text.is_empty() => text.to_string(),
        _ => strip_xml_tags(inner).trim().to_string(),
    }
}

/// Removes everything between `<` and `>` (inclusive) and returns the remaining characters.
fn strip_xml_tags(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut in_tag = false;
    for ch in s.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => out.push(ch),
            _ => {}
        }
    }
    out
}
4600
4601fn extract_text_from_pdf_bytes(pdf_bytes: &[u8]) -> String {
4608 let doc = match Document::load_mem(pdf_bytes) {
4609 Ok(d) => d,
4610 Err(_) => return String::new(),
4611 };
4612
4613 let mut text = String::new();
4614
4615 for obj in doc.objects.values() {
4616 if let Object::Stream(ref stream) = obj {
4617 let content = match stream.decompressed_content() {
4619 Ok(c) => c,
4620 Err(_) => stream.content.clone(),
4621 };
4622 let fragment = extract_text_from_content_stream(&content);
4623 if !fragment.is_empty() {
4624 text.push(' ');
4625 text.push_str(&fragment);
4626 }
4627 }
4628 }
4629 text
4630}
4631
/// Extracts the text of PDF literal strings (`( ... )`) from raw content-stream bytes.
///
/// The bytes are decoded lossily as UTF-8 and scanned left to right. For every `(` the
/// balanced closing `)` is located (nested parentheses are counted, a backslash escapes the
/// next byte). A literal is kept when every character is ASCII and alphanumeric, whitespace,
/// or punctuation; kept literals have their PDF escape sequences decoded, are trimmed, and —
/// if non-empty — appended to the result preceded by one space.
///
/// Scanning resumes *after* a kept literal, so parentheses nested or escaped inside it are
/// not reported a second time. After an unterminated or rejected literal, scanning resumes
/// right behind its opening parenthesis.
///
/// Hex strings (`<...>`) are not handled.
fn extract_text_from_content_stream(content: &[u8]) -> String {
    // Index of the `)` that balances the `(` located just before `start`, if any.
    fn literal_close(bytes: &[u8], start: usize) -> Option<usize> {
        let mut depth = 1usize;
        let mut pos = start;
        while pos < bytes.len() {
            match bytes[pos] {
                // Skip the escaped byte so `\(` / `\)` do not affect nesting.
                b'\\' => pos += 1,
                b'(' => depth += 1,
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return Some(pos);
                    }
                }
                _ => {}
            }
            pos += 1;
        }
        None
    }

    // Same acceptance rule the raw literal has always been checked against.
    fn is_plain_ascii(c: char) -> bool {
        c.is_ascii() && (c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation())
    }

    // Decodes backslash escapes of a PDF literal string: named escapes, `\(`, `\)`, `\\`,
    // one-to-three-digit octal codes, and backslash-newline line continuations. A backslash
    // before any other character is dropped.
    fn decode_escapes(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        let mut chars = raw.chars().peekable();
        while let Some(c) = chars.next() {
            if c != '\\' {
                out.push(c);
                continue;
            }
            match chars.next() {
                Some('n') => out.push('\n'),
                Some('r') => out.push('\r'),
                Some('t') => out.push('\t'),
                Some('b') => out.push('\u{8}'),
                Some('f') => out.push('\u{c}'),
                Some(first @ '0'..='7') => {
                    let mut code = first.to_digit(8).unwrap_or(0);
                    for _ in 0..2 {
                        match chars.peek().and_then(|d| d.to_digit(8)) {
                            Some(digit) => {
                                code = code * 8 + digit;
                                chars.next();
                            }
                            None => break,
                        }
                    }
                    // High-order overflow is ignored; the byte is mapped to the code point
                    // of the same value.
                    out.push(char::from((code & 0xFF) as u8));
                }
                Some('\r') => {
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                }
                Some('\n') | None => {}
                Some(other) => out.push(other),
            }
        }
        out
    }

    let text = String::from_utf8_lossy(content);
    let bytes = text.as_bytes();
    let mut result = String::new();
    let mut pos = 0;

    while pos < bytes.len() {
        if bytes[pos] != b'(' {
            pos += 1;
            continue;
        }
        let start = pos + 1;
        // Default: continue right behind this '(' (unterminated or rejected literal).
        pos = start;

        let close = match literal_close(bytes, start) {
            Some(close) => close,
            None => continue,
        };
        // `start` follows an ASCII '(' and `close` points at an ASCII ')', so both are
        // valid char boundaries.
        let raw = &text[start..close];
        if !raw.chars().all(is_plain_ascii) {
            continue;
        }

        let decoded = decode_escapes(raw);
        let trimmed = decoded.trim();
        if !trimmed.is_empty() {
            result.push(' ');
            result.push_str(trimmed);
        }
        pos = close + 1;
    }
    result
}
4676
4677pub fn validate_text_completeness(
4690 original_xfa_bytes: &[u8],
4691 flattened_bytes: &[u8],
4692) -> crate::error::Result<TextValidation> {
4693 let packets = match crate::extract::extract_xfa_from_bytes(original_xfa_bytes.to_vec()) {
4695 Ok(p) => p,
4696 Err(_) => {
4697 return Ok(TextValidation {
4699 expected_values: vec![],
4700 found_values: vec![],
4701 missing_values: vec![],
4702 completeness_ratio: 1.0,
4703 });
4704 }
4705 };
4706
4707 let datasets_xml = match packets.datasets() {
4708 Some(ds) => ds.to_string(),
4709 None => {
4710 return Ok(TextValidation {
4711 expected_values: vec![],
4712 found_values: vec![],
4713 missing_values: vec![],
4714 completeness_ratio: 1.0,
4715 });
4716 }
4717 };
4718
4719 let expected_values = extract_field_values_from_datasets(&datasets_xml);
4721
4722 if expected_values.is_empty() {
4723 return Ok(TextValidation {
4724 expected_values: vec![],
4725 found_values: vec![],
4726 missing_values: vec![],
4727 completeness_ratio: 1.0,
4728 });
4729 }
4730
4731 let output_text = extract_text_from_pdf_bytes(flattened_bytes);
4733
4734 let mut found_values = Vec::new();
4736 let mut missing_values = Vec::new();
4737
4738 for value in &expected_values {
4739 if output_text.contains(value.as_str()) {
4740 found_values.push(value.clone());
4741 } else {
4742 missing_values.push(value.clone());
4743 }
4744 }
4745
4746 let completeness_ratio = if expected_values.is_empty() {
4747 1.0
4748 } else {
4749 found_values.len() as f64 / expected_values.len() as f64
4750 };
4751
4752 Ok(TextValidation {
4753 expected_values,
4754 found_values,
4755 missing_values,
4756 completeness_ratio,
4757 })
4758}
4759
/// Test helper: runs `flatten_xfa_to_pdf` with `FLATTEN_DEPTH` forced to 1, then resets the
/// depth to 0 and returns the flatten result.
#[cfg(test)]
fn flatten_xfa_to_pdf_simulate_reentrant(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
    let set_depth = |value: u32| FLATTEN_DEPTH.with(|depth| depth.set(value));

    set_depth(1);
    let outcome = flatten_xfa_to_pdf(pdf_bytes);
    set_depth(0);
    outcome
}
4779
4780#[cfg(test)]
4781mod tests {
4782 use super::*;
4783
4784 fn build_xfa_pdf_with_content(xdp: &str, page_content: Vec<u8>) -> Vec<u8> {
4786 use lopdf::{dictionary, Document, Object, Stream};
4787 let mut doc = Document::with_version("1.4");
4788 let xdp_bytes = xdp.as_bytes().to_vec();
4789 let xfa_stream = Stream::new(
4790 dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4791 xdp_bytes,
4792 );
4793 let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4794 let pages_id = doc.new_object_id();
4795 let content_stream = Stream::new(
4796 dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4797 page_content,
4798 );
4799 let content_id = doc.add_object(Object::Stream(content_stream));
4800 let page_id = doc.add_object(Object::Dictionary(dictionary! {
4801 "Type" => Object::Name(b"Page".to_vec()),
4802 "Parent" => Object::Reference(pages_id),
4803 "MediaBox" => Object::Array(vec![
4804 Object::Integer(0), Object::Integer(0),
4805 Object::Integer(612), Object::Integer(792),
4806 ]),
4807 "Contents" => Object::Reference(content_id)
4808 }));
4809 doc.objects.insert(
4810 pages_id,
4811 Object::Dictionary(dictionary! {
4812 "Type" => Object::Name(b"Pages".to_vec()),
4813 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
4814 "Count" => Object::Integer(1)
4815 }),
4816 );
4817 let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4818 "XFA" => Object::Reference(xfa_id),
4819 "Fields" => Object::Array(vec![])
4820 }));
4821 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4822 "Type" => Object::Name(b"Catalog".to_vec()),
4823 "Pages" => Object::Reference(pages_id),
4824 "AcroForm" => Object::Reference(acroform_id)
4825 }));
4826 doc.trailer.set("Root", Object::Reference(catalog_id));
4827 let mut out = Vec::new();
4828 doc.save_to(&mut out).unwrap();
4829 out
4830 }
4831
4832 fn build_xfa_pdf(xdp: &str) -> Vec<u8> {
4833 build_xfa_pdf_with_content(xdp, Vec::new())
4834 }
4835
4836 fn build_xfa_doc_with_xfa_array() -> (Document, ObjectId, Vec<ObjectId>) {
4837 use lopdf::{dictionary, Document, Object, Stream};
4838
4839 let mut doc = Document::with_version("1.4");
4840 let pages_id = doc.new_object_id();
4841 let content_id = doc.add_object(Object::Stream(Stream::new(
4842 dictionary! { "Length" => Object::Integer(0) },
4843 Vec::new(),
4844 )));
4845 let page_id = doc.add_object(Object::Dictionary(dictionary! {
4846 "Type" => Object::Name(b"Page".to_vec()),
4847 "Parent" => Object::Reference(pages_id),
4848 "MediaBox" => Object::Array(vec![
4849 Object::Integer(0), Object::Integer(0),
4850 Object::Integer(612), Object::Integer(792),
4851 ]),
4852 "Contents" => Object::Reference(content_id)
4853 }));
4854 doc.objects.insert(
4855 pages_id,
4856 Object::Dictionary(dictionary! {
4857 "Type" => Object::Name(b"Pages".to_vec()),
4858 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
4859 "Count" => Object::Integer(1)
4860 }),
4861 );
4862
4863 let packet_payloads = [
4864 (
4865 b"xdp:xdp".to_vec(),
4866 br#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#.to_vec(),
4867 ),
4868 (
4869 b"template".to_vec(),
4870 br#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform/></template>"#
4871 .to_vec(),
4872 ),
4873 (
4874 b"datasets".to_vec(),
4875 br#"<xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"></xfa:datasets>"#
4876 .to_vec(),
4877 ),
4878 ];
4879
4880 let mut xfa_array = Vec::new();
4881 let mut xfa_ids = Vec::new();
4882 for (packet_name, payload) in packet_payloads {
4883 let stream_id = doc.add_object(Object::Stream(Stream::new(
4884 dictionary! { "Length" => Object::Integer(payload.len() as i64) },
4885 payload,
4886 )));
4887 xfa_array.push(Object::Name(packet_name));
4888 xfa_array.push(Object::Reference(stream_id));
4889 xfa_ids.push(stream_id);
4890 }
4891
4892 let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4893 "XFA" => Object::Array(xfa_array),
4894 "Fields" => Object::Array(vec![])
4895 }));
4896 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4897 "Type" => Object::Name(b"Catalog".to_vec()),
4898 "Pages" => Object::Reference(pages_id),
4899 "AcroForm" => Object::Reference(acroform_id)
4900 }));
4901 doc.trailer.set("Root", Object::Reference(catalog_id));
4902 (doc, acroform_id, xfa_ids)
4903 }
4904
4905 fn build_xfa_pdf_with_widget_appearance(
4906 page_content: Vec<u8>,
4907 normal_appearance: Object,
4908 widget_extra: Dictionary,
4909 ) -> Vec<u8> {
4910 use lopdf::{dictionary, Document, Object, Stream};
4911
4912 let mut doc = Document::with_version("1.4");
4913 let xdp_bytes = SIMPLE_XDP.as_bytes().to_vec();
4914 let xfa_stream = Stream::new(
4915 dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
4916 xdp_bytes,
4917 );
4918 let xfa_id = doc.add_object(Object::Stream(xfa_stream));
4919
4920 let pages_id = doc.new_object_id();
4921 let content_id = doc.add_object(Object::Stream(Stream::new(
4922 dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
4923 page_content,
4924 )));
4925
4926 let appearance_id = match normal_appearance {
4927 Object::Reference(id) => id,
4928 other => doc.add_object(other),
4929 };
4930
4931 let widget_id = doc.new_object_id();
4932 let page_id = doc.add_object(Object::Dictionary(dictionary! {
4933 "Type" => Object::Name(b"Page".to_vec()),
4934 "Parent" => Object::Reference(pages_id),
4935 "MediaBox" => Object::Array(vec![
4936 Object::Integer(0), Object::Integer(0),
4937 Object::Integer(612), Object::Integer(792),
4938 ]),
4939 "Contents" => Object::Reference(content_id),
4940 "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
4941 "Resources" => Object::Dictionary(dictionary! {})
4942 }));
4943
4944 let mut widget = dictionary! {
4945 "Type" => Object::Name(b"Annot".to_vec()),
4946 "Subtype" => Object::Name(b"Widget".to_vec()),
4947 "Rect" => Object::Array(vec![
4948 Object::Integer(100), Object::Integer(700),
4949 Object::Integer(220), Object::Integer(730),
4950 ]),
4951 "AP" => Object::Dictionary(dictionary! {
4952 "N" => Object::Reference(appearance_id)
4953 }),
4954 "P" => Object::Reference(page_id)
4955 };
4956 for (key, value) in widget_extra {
4957 widget.set(key, value);
4958 }
4959 doc.objects.insert(widget_id, Object::Dictionary(widget));
4960
4961 doc.objects.insert(
4962 pages_id,
4963 Object::Dictionary(dictionary! {
4964 "Type" => Object::Name(b"Pages".to_vec()),
4965 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
4966 "Count" => Object::Integer(1)
4967 }),
4968 );
4969
4970 let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
4971 "XFA" => Object::Reference(xfa_id),
4972 "Fields" => Object::Array(vec![Object::Reference(widget_id)])
4973 }));
4974 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
4975 "Type" => Object::Name(b"Catalog".to_vec()),
4976 "Pages" => Object::Reference(pages_id),
4977 "AcroForm" => Object::Reference(acroform_id)
4978 }));
4979 doc.trailer.set("Root", Object::Reference(catalog_id));
4980
4981 let mut out = Vec::new();
4982 doc.save_to(&mut out).unwrap();
4983 out
4984 }
4985
4986 #[allow(dead_code)]
4987 fn find_last_content_stream(doc: &Document, page_id: ObjectId) -> &Stream {
4988 let page_dict = doc.get_dictionary(page_id).expect("page dict");
4989 match page_dict.get(b"Contents").expect("contents") {
4990 Object::Reference(id) => doc
4991 .get_object(*id)
4992 .expect("contents object")
4993 .as_stream()
4994 .expect("contents stream"),
4995 Object::Array(arr) => {
4996 let last = arr.last().expect("last content stream");
4997 let id = last.as_reference().expect("contents ref");
4998 doc.get_object(id)
4999 .expect("contents object")
5000 .as_stream()
5001 .expect("contents stream")
5002 }
5003 other => other.as_stream().expect("contents stream"),
5004 }
5005 }
5006
5007 #[allow(dead_code)]
5008 fn page_xobjects(doc: &Document, page_id: ObjectId) -> Dictionary {
5009 let page_dict = doc.get_dictionary(page_id).expect("page dict");
5010 let resources = page_dict
5011 .get(b"Resources")
5012 .expect("resources")
5013 .as_dict()
5014 .expect("resources dict");
5015 resources
5016 .get(b"XObject")
5017 .expect("xobjects")
5018 .as_dict()
5019 .expect("xobject dict")
5020 .clone()
5021 }
5022
5023 #[test]
5024 fn append_to_page_content_flattens_indirect_contents_arrays() {
5025 let mut doc = Document::with_version("1.4");
5026 let pages_id = doc.new_object_id();
5027 let first_stream_id = doc.add_object(Stream::new(dictionary! {}, b"q\n".to_vec()));
5028 let second_stream_id = doc.add_object(Stream::new(dictionary! {}, b"Q\n".to_vec()));
5029 let contents_array_id = doc.add_object(Object::Array(vec![
5030 Object::Reference(first_stream_id),
5031 Object::Reference(second_stream_id),
5032 ]));
5033 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5034 "Type" => Object::Name(b"Page".to_vec()),
5035 "Parent" => Object::Reference(pages_id),
5036 "MediaBox" => Object::Array(vec![
5037 Object::Integer(0), Object::Integer(0),
5038 Object::Integer(612), Object::Integer(792),
5039 ]),
5040 "Contents" => Object::Reference(contents_array_id),
5041 }));
5042 doc.objects.insert(
5043 pages_id,
5044 Object::Dictionary(dictionary! {
5045 "Type" => Object::Name(b"Pages".to_vec()),
5046 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5047 "Count" => Object::Integer(1),
5048 }),
5049 );
5050
5051 append_to_page_content(&mut doc, page_id, b"BT\nET\n");
5052
5053 let page_dict = doc.get_dictionary(page_id).expect("page dict");
5054 let contents = page_dict.get(b"Contents").expect("contents");
5055 let items = contents.as_array().expect("flattened contents array");
5056
5057 assert_eq!(items.len(), 3, "existing streams + appended stream");
5058 assert!(
5059 items.iter().all(|obj| obj.as_reference().is_ok()),
5060 "contents array must stay flat and reference only streams"
5061 );
5062 for object in items {
5063 let stream_id = object.as_reference().expect("stream ref");
5064 assert!(
5065 doc.get_object(stream_id)
5066 .expect("stream object")
5067 .as_stream()
5068 .is_ok(),
5069 "nested arrays must not survive in page contents"
5070 );
5071 }
5072 }
5073
    /// Minimal XDP fixture: a paginated form with one 8.5in x 11in page area and a single
    /// `firstName` text field (caption "First Name", value "John"). Contains no scripts.
    const SIMPLE_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
 <subform name="form1" layout="paginate">
 <pageSet>
 <pageArea name="Page1">
 <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
 <medium stock="default" short="8.5in" long="11in"/>
 </pageArea>
 </pageSet>
 <subform name="section" layout="tb" w="7.5in">
 <field name="firstName" w="3.5in" h="0.3in">
 <caption><value><text>First Name</text></value></caption>
 <ui><textEdit/></ui>
 <value><text>John</text></value>
 </field>
 </subform>
 </subform>
</template>
</xdp:xdp>"#;
5094
    /// Same layout as `SIMPLE_XDP`, plus an `initialize` event on the `section` subform whose
    /// script is JavaScript (`app.alert('blocked');`).
    const JS_EVENT_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
 <subform name="form1" layout="paginate">
 <pageSet>
 <pageArea name="Page1">
 <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
 <medium stock="default" short="8.5in" long="11in"/>
 </pageArea>
 </pageSet>
 <subform name="section" layout="tb" w="7.5in">
 <event activity="initialize">
 <script contentType="application/x-javascript">app.alert('blocked');</script>
 </event>
 <field name="firstName" w="3.5in" h="0.3in">
 <caption><value><text>First Name</text></value></caption>
 <ui><textEdit/></ui>
 <value><text>John</text></value>
 </field>
 </subform>
 </subform>
</template>
</xdp:xdp>"#;
5118
    /// Generates an XDP with 40 text fields (`line0`..`line39`, each 0.3in high) stacked in a
    /// top-to-bottom subform on a page area whose content area is 10in high.
    ///
    /// When `base_profile` is `Some(value)`, the `<template>` element gets a
    /// `baseProfile="value"` attribute; with `None` the attribute is omitted.
    fn overflowing_paginate_xdp(base_profile: Option<&str>) -> String {
        // Build the repeated <field> elements first; they are spliced into the template below.
        let mut fields = String::new();
        for i in 0..40 {
            fields.push_str(&format!(
                r#"
 <field name="line{i}" w="7.0in" h="0.3in">
 <ui><textEdit/></ui>
 <value><text>Line {i}</text></value>
 </field>"#
            ));
        }

        // Optional attribute text, including its leading space, or an empty string.
        let base_profile_attr = base_profile
            .map(|value| format!(r#" baseProfile="{value}""#))
            .unwrap_or_default();

        format!(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"{base_profile_attr}>
 <subform name="form1" layout="paginate">
 <pageSet>
 <pageArea name="Page1">
 <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
 <medium stock="default" short="8.5in" long="11in"/>
 </pageArea>
 </pageSet>
 <subform name="section" layout="tb" w="7.5in">{fields}
 </subform>
 </subform>
</template>
</xdp:xdp>"#
        )
    }
5153
5154 #[test]
5155 fn flatten_simple_form_produces_non_empty_content() {
5156 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5157 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5158
5159 let doc = Document::load_mem(&result).expect("load flattened PDF");
5161 let pages: Vec<ObjectId> = doc.page_iter().collect();
5162 assert!(!pages.is_empty(), "flattened PDF has no pages");
5163
5164 let mut found_content = false;
5166 for page_id in &pages {
5167 if let Ok(page_dict) = doc.get_dictionary(*page_id) {
5168 if let Ok(Object::Reference(stream_id)) = page_dict.get(b"Contents") {
5169 if let Ok(obj) = doc.get_object(*stream_id) {
5170 if let Ok(stream) = obj.as_stream() {
5171 if !stream.content.is_empty() {
5172 found_content = true;
5173 }
5174 }
5175 }
5176 }
5177 }
5178 }
5179 assert!(found_content, "all content streams are empty after flatten");
5180 }
5181
5182 #[test]
5183 fn flatten_reports_best_effort_for_xfa_javascript_event() {
5184 let pdf_bytes = build_xfa_pdf(JS_EVENT_XDP);
5185
5186 let (flattened, metadata) =
5187 flatten_xfa_to_pdf_with_metadata(&pdf_bytes).expect("flatten should skip JS");
5188
5189 assert!(!flattened.is_empty());
5190 assert_eq!(metadata.output_quality, OutputQuality::BestEffort);
5191 assert!(metadata.dynamic_scripts.js_present);
5192 assert_eq!(metadata.dynamic_scripts.js_skipped, 1);
5193 }
5194
5195 #[test]
5196 fn flatten_strips_catalog_open_action_javascript() {
5197 let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5198 {
5199 let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5200 let root_id = match doc.trailer.get(b"Root") {
5201 Ok(Object::Reference(id)) => *id,
5202 _ => panic!("no Root in test PDF"),
5203 };
5204 if let Ok(Object::Dictionary(catalog)) = doc.get_object_mut(root_id) {
5205 catalog.set(
5206 "OpenAction",
5207 Object::Dictionary(dictionary! {
5208 "S" => Object::Name(b"JavaScript".to_vec()),
5209 "JS" => Object::String(
5210 b"app.alert('blocked')".to_vec(),
5211 lopdf::StringFormat::Literal,
5212 ),
5213 }),
5214 );
5215 }
5216 let mut out = Vec::new();
5217 doc.save_to(&mut out).expect("save test PDF");
5218 pdf_bytes = out;
5219 }
5220
5221 let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5222 let doc = Document::load_mem(&flattened).expect("load flattened PDF");
5223 let root_id = match doc.trailer.get(b"Root") {
5224 Ok(Object::Reference(id)) => *id,
5225 _ => panic!("no Root in flattened PDF"),
5226 };
5227 let catalog = doc.get_dictionary(root_id).expect("catalog dict");
5228 assert!(
5229 catalog.get(b"OpenAction").is_err(),
5230 "/OpenAction JavaScript must be stripped from flattened output"
5231 );
5232 }
5233
    /// Flattens a paginated form whose single row lays out two fields left-to-right and
    /// checks that the result has exactly one page. When that page's `/Contents` is a
    /// reference to a stream, the stream must contain `BT` and `Tj` text operators.
    #[test]
    fn flatten_paginate_subform_with_nested_pageset_produces_visible_content() {
        // Fixture: `form1` (paginate) -> `row1` (lr-tb) with `firstName` and `lastName`.
        const LR_TB_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
 <subform name="form1" layout="paginate" locale="en_US">
 <pageSet>
 <pageArea name="Page1" id="Page1">
 <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
 <medium stock="default" short="8.5in" long="11in"/>
 </pageArea>
 </pageSet>
 <subform name="row1" layout="lr-tb" w="7.5in" h="0.4in">
 <field name="firstName" w="3.5in" h="0.4in">
 <caption><value><text>First</text></value></caption>
 <ui><textEdit/></ui>
 <value><text>John</text></value>
 </field>
 <field name="lastName" w="3.5in" h="0.4in">
 <caption><value><text>Last</text></value></caption>
 <ui><textEdit/></ui>
 <value><text>Doe</text></value>
 </field>
 </subform>
 </subform>
</template>
</xdp:xdp>"#;

        let pdf_bytes = build_xfa_pdf(LR_TB_XDP);
        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");

        let doc = Document::load_mem(&result).expect("load flattened PDF");
        let pages: Vec<ObjectId> = doc.page_iter().collect();

        assert_eq!(pages.len(), 1, "expected 1 page, got {}", pages.len());

        // NOTE(review): the operator checks below are skipped silently when /Contents is not
        // a single reference to a stream (e.g. an array of streams).
        if let Ok(page_dict) = doc.get_dictionary(pages[0]) {
            if let Ok(lopdf::Object::Reference(stream_id)) = page_dict.get(b"Contents") {
                if let Ok(obj) = doc.get_object(*stream_id) {
                    if let Ok(stream) = obj.as_stream() {
                        // The stored stream bytes are inspected as-is (no decompression here).
                        let content = String::from_utf8_lossy(&stream.content);
                        assert!(
                            content.contains("BT\n"),
                            "no text operators in page 1 content stream (should have BT from field values)"
                        );
                        assert!(
                            content.contains("Tj\n"),
                            "no text show operators in page 1 content stream"
                        );
                    }
                }
            }
        }
    }
5296
5297 #[test]
5298 fn static_single_page_pdf_does_not_append_xfa_overflow_pages() {
5299 let xdp = overflowing_paginate_xdp(Some("interactiveForms"));
5300 let pdf_bytes = build_xfa_pdf(&xdp);
5301 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5302
5303 let doc = Document::load_mem(&result).expect("load flattened PDF");
5304 let pages: Vec<ObjectId> = doc.page_iter().collect();
5305
5306 assert_eq!(
5307 pages.len(),
5308 1,
5309 "static 1-page PDFs should preserve the original page when XFA layout over-paginates"
5310 );
5311 }
5312
5313 #[test]
5314 fn dynamic_single_page_pdf_can_expand_beyond_original_page_count() {
5315 let xdp = overflowing_paginate_xdp(None);
5320 let pdf_bytes = build_xfa_pdf(&xdp);
5321 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5322
5323 let doc = Document::load_mem(&result).expect("load flattened PDF");
5324 let pages: Vec<ObjectId> = doc.page_iter().collect();
5325
5326 assert_eq!(
5327 pages.len(),
5328 2,
5329 "dynamic 1-page PDFs should be allowed to grow when XFA layout paginates"
5330 );
5331 }
5332
5333 #[test]
5334 fn flatten_removes_acroform() {
5335 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5336 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5337 let doc = Document::load_mem(&result).expect("load flattened PDF");
5338 let root_id = doc.trailer.get(b"Root").unwrap().as_reference().unwrap();
5339 let catalog = doc.get_dictionary(root_id).unwrap();
5340 assert!(
5341 catalog.get(b"AcroForm").is_err(),
5342 "/AcroForm still present after flatten"
5343 );
5344 }
5345
5346 #[test]
5347 fn flatten_non_xfa_pdf_unchanged() {
5348 let mut doc = Document::with_version("1.4");
5350 let pages_id = doc.new_object_id();
5351 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5352 "Type" => Object::Name(b"Page".to_vec()),
5353 "Parent" => Object::Reference(pages_id),
5354 "MediaBox" => Object::Array(vec![
5355 Object::Integer(0), Object::Integer(0),
5356 Object::Integer(612), Object::Integer(792),
5357 ])
5358 }));
5359 doc.objects.insert(
5360 pages_id,
5361 Object::Dictionary(dictionary! {
5362 "Type" => Object::Name(b"Pages".to_vec()),
5363 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5364 "Count" => Object::Integer(1)
5365 }),
5366 );
5367 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5368 "Type" => Object::Name(b"Catalog".to_vec()),
5369 "Pages" => Object::Reference(pages_id)
5370 }));
5371 doc.trailer.set("Root", Object::Reference(catalog_id));
5372 let mut raw = Vec::new();
5373 doc.save_to(&mut raw).unwrap();
5374
5375 let result = flatten_xfa_to_pdf(&raw).expect("flatten non-XFA failed");
5377 assert!(!result.is_empty());
5378 }
5379
5380 #[test]
5381 fn placeholder_only_page_does_not_trigger_static_strip_path() {
5382 const PLACEHOLDER_STREAM: &str = r#"BT
5383/Helv 24 Tf
538472 720 Td
5385(Please wait...) Tj
53860 -32 Td
5387(If this message is not eventually replaced by the proper contents of the document,) Tj
53880 -32 Td
5389(your PDF viewer may not be able to display this type of document.) Tj
53900 -32 Td
5391(You can upgrade to the latest version of Adobe Reader by visiting reader_download.) Tj
5392ET
5393"#;
5394
5395 let pdf_bytes =
5396 build_xfa_pdf_with_content(SIMPLE_XDP, PLACEHOLDER_STREAM.as_bytes().to_vec());
5397 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5398
5399 let doc = Document::load_mem(&result).expect("load flattened PDF");
5400 let page_id = doc.page_iter().next().expect("flattened page");
5401 let page_dict = doc.get_dictionary(page_id).expect("page dict");
5402 let contents_id = page_dict
5403 .get(b"Contents")
5404 .ok()
5405 .and_then(|object| object.as_reference().ok())
5406 .expect("contents ref");
5407 let stream = doc
5408 .get_object(contents_id)
5409 .expect("contents object")
5410 .as_stream()
5411 .expect("contents stream");
5412 let content = String::from_utf8_lossy(&stream.content);
5413
5414 assert!(
5415 content.contains("John"),
5416 "flattened page should contain XFA-rendered field content"
5417 );
5418 assert!(
5419 !content.contains("Please wait"),
5420 "placeholder text should not survive XFA flattening"
5421 );
5422 }
5423
5424 #[test]
5425 fn hybrid_static_pdf_uses_xfa_layout_over_static_content() {
5426 let appearance = Object::Stream(Stream::new(
5430 dictionary! {
5431 "Type" => Object::Name(b"XObject".to_vec()),
5432 "Subtype" => Object::Name(b"Form".to_vec()),
5433 "BBox" => Object::Array(vec![
5434 Object::Integer(0), Object::Integer(0),
5435 Object::Integer(120), Object::Integer(30),
5436 ]),
5437 "Matrix" => Object::Array(vec![
5438 Object::Integer(1), Object::Integer(0),
5439 Object::Integer(0), Object::Integer(1),
5440 Object::Integer(0), Object::Integer(0),
5441 ]),
5442 "Resources" => Object::Dictionary(dictionary! {}),
5443 },
5444 b"0 G\n0.5 0.5 119 29 re\ns\n".to_vec(),
5445 ));
5446 let page_content = b"BT /F1 12 Tf 72 720 Td (Line 1) Tj 0 -14 Td (Line 2) Tj 0 -14 Td (Line 3) Tj 0 -14 Td (Line 4) Tj 0 -14 Td (Line 5) Tj ET\n".to_vec();
5448 let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5449 page_content,
5450 appearance,
5451 dictionary! {
5452 "FT" => Object::Name(b"Tx".to_vec()),
5453 "T" => Object::string_literal("field[0]"),
5454 },
5455 );
5456
5457 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5458 let doc = Document::load_mem(&result).expect("load flattened PDF");
5459 let page_id = doc.page_iter().next().expect("page");
5460 let page_dict = doc.get_dictionary(page_id).expect("page dict");
5461
5462 assert!(
5464 page_dict.get(b"Annots").is_err(),
5465 "XFA-flattened page should have no annotations"
5466 );
5467 }
5468
5469 #[test]
5470 fn hybrid_static_pdf_uses_selected_button_appearance_state() {
5471 let yes_stream = Object::Stream(Stream::new(
5472 dictionary! {
5473 "Type" => Object::Name(b"XObject".to_vec()),
5474 "Subtype" => Object::Name(b"Form".to_vec()),
5475 "BBox" => Object::Array(vec![
5476 Object::Integer(0), Object::Integer(0),
5477 Object::Integer(20), Object::Integer(20),
5478 ]),
5479 "Matrix" => Object::Array(vec![
5480 Object::Integer(1), Object::Integer(0),
5481 Object::Integer(0), Object::Integer(1),
5482 Object::Integer(0), Object::Integer(0),
5483 ]),
5484 "Resources" => Object::Dictionary(dictionary! {}),
5485 },
5486 b"BT /F1 8 Tf 1 1 Td (YES) Tj ET\n".to_vec(),
5487 ));
5488 let off_stream = Object::Stream(Stream::new(
5489 dictionary! {
5490 "Type" => Object::Name(b"XObject".to_vec()),
5491 "Subtype" => Object::Name(b"Form".to_vec()),
5492 "BBox" => Object::Array(vec![
5493 Object::Integer(0), Object::Integer(0),
5494 Object::Integer(20), Object::Integer(20),
5495 ]),
5496 "Matrix" => Object::Array(vec![
5497 Object::Integer(1), Object::Integer(0),
5498 Object::Integer(0), Object::Integer(1),
5499 Object::Integer(0), Object::Integer(0),
5500 ]),
5501 "Resources" => Object::Dictionary(dictionary! {}),
5502 },
5503 b"BT /F1 8 Tf 1 1 Td (OFF) Tj ET\n".to_vec(),
5504 ));
5505
5506 let mut doc = Document::with_version("1.4");
5507 let state_id = doc.add_object(Object::Dictionary(dictionary! {
5508 "Yes" => yes_stream,
5509 "Off" => off_stream,
5510 }));
5511 let annot = dictionary! {
5512 "Subtype" => Object::Name(b"Widget".to_vec()),
5513 "Rect" => Object::Array(vec![
5514 Object::Integer(100), Object::Integer(700),
5515 Object::Integer(120), Object::Integer(720),
5516 ]),
5517 "AP" => Object::Dictionary(dictionary! {
5518 "N" => Object::Reference(state_id),
5519 }),
5520 "AS" => Object::Name(b"Yes".to_vec()),
5521 "FT" => Object::Name(b"Btn".to_vec()),
5522 };
5523 let ap_id =
5524 resolve_widget_normal_appearance(&mut doc, &annot).expect("selected normal appearance");
5525 let stream = doc
5526 .get_object(ap_id)
5527 .expect("appearance stream")
5528 .as_stream()
5529 .expect("appearance stream");
5530 let content = String::from_utf8_lossy(&stream.content);
5531
5532 assert!(
5533 content.contains("YES"),
5534 "flatten should choose the selected normal appearance state"
5535 );
5536 }
5537
5538 #[test]
5539 fn widget_as_off_without_off_appearance_returns_none() {
5540 let yes_stream = Object::Stream(Stream::new(
5544 dictionary! {
5545 "Type" => Object::Name(b"XObject".to_vec()),
5546 "Subtype" => Object::Name(b"Form".to_vec()),
5547 "BBox" => Object::Array(vec![
5548 Object::Integer(0), Object::Integer(0),
5549 Object::Integer(10), Object::Integer(10),
5550 ]),
5551 },
5552 b"q 5 5 m 5 5 l S Q\n".to_vec(),
5553 ));
5554
5555 let mut doc = Document::with_version("1.4");
5556 let state_id = doc.add_object(Object::Dictionary(dictionary! {
5558 "0" => yes_stream,
5559 }));
5560 let annot = dictionary! {
5561 "Subtype" => Object::Name(b"Widget".to_vec()),
5562 "Rect" => Object::Array(vec![
5563 Object::Integer(100), Object::Integer(700),
5564 Object::Integer(110), Object::Integer(710),
5565 ]),
5566 "AP" => Object::Dictionary(dictionary! {
5567 "N" => Object::Reference(state_id),
5568 }),
5569 "AS" => Object::Name(b"Off".to_vec()),
5570 "FT" => Object::Name(b"Btn".to_vec()),
5571 };
5572 assert!(
5573 resolve_widget_normal_appearance(&mut doc, &annot).is_none(),
5574 "Off state with no Off appearance should not resolve to the on-state stream"
5575 );
5576 }
5577
5578 #[test]
5579 fn bake_checkbox_radio_ap_marks_skips_off_widgets_without_off_normal_appearance() {
5580 let pdf_bytes = build_xfa_pdf_with_widget_appearance(
5581 Vec::new(),
5582 Object::Dictionary(dictionary! {
5583 "1" => Object::Stream(Stream::new(
5584 dictionary! {
5585 "Type" => Object::Name(b"XObject".to_vec()),
5586 "Subtype" => Object::Name(b"Form".to_vec()),
5587 "BBox" => Object::Array(vec![
5588 Object::Integer(0), Object::Integer(0),
5589 Object::Integer(10), Object::Integer(10),
5590 ]),
5591 "Resources" => Object::Dictionary(dictionary! {}),
5592 },
5593 b"q 1 1 8 8 re W n 2 8 m 8 2 l 8 8 m 2 2 l s Q\n".to_vec(),
5594 )),
5595 }),
5596 dictionary! {
5597 "FT" => Object::Name(b"Btn".to_vec()),
5598 "AS" => Object::Name(b"Off".to_vec()),
5599 "T" => Object::string_literal("checkbox[0]"),
5600 },
5601 );
5602
5603 let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
5604 let page_id = doc.page_iter().next().expect("page");
5605 let baked = bake_checkbox_radio_ap_marks(&mut doc, page_id);
5606
5607 assert_eq!(baked, 0, "Off-state widget must not stamp the on-mark");
5608 }
5609
5610 #[test]
5611 fn adding_widget_xobject_preserves_indirect_inline_page_xobjects() {
5612 let mut doc = Document::with_version("1.4");
5613 let existing_xobject_id = doc.add_object(Object::Stream(Stream::new(
5614 dictionary! {
5615 "Type" => Object::Name(b"XObject".to_vec()),
5616 "Subtype" => Object::Name(b"Form".to_vec()),
5617 "BBox" => Object::Array(vec![
5618 Object::Integer(0), Object::Integer(0),
5619 Object::Integer(10), Object::Integer(10),
5620 ]),
5621 },
5622 b"q Q\n".to_vec(),
5623 )));
5624 let xobject_dict_id = doc.add_object(Object::Dictionary(dictionary! {
5625 "R11" => Object::Reference(existing_xobject_id),
5626 }));
5627
5628 let pages_id = doc.new_object_id();
5629 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5630 "Type" => Object::Name(b"Page".to_vec()),
5631 "Parent" => Object::Reference(pages_id),
5632 "MediaBox" => Object::Array(vec![
5633 Object::Integer(0), Object::Integer(0),
5634 Object::Integer(612), Object::Integer(792),
5635 ]),
5636 "Resources" => Object::Dictionary(dictionary! {
5637 "XObject" => Object::Reference(xobject_dict_id),
5638 }),
5639 }));
5640 doc.objects.insert(
5641 pages_id,
5642 Object::Dictionary(dictionary! {
5643 "Type" => Object::Name(b"Pages".to_vec()),
5644 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5645 "Count" => Object::Integer(1)
5646 }),
5647 );
5648
5649 let new_xobject_id = doc.add_object(Object::Stream(Stream::new(
5650 dictionary! {
5651 "Type" => Object::Name(b"XObject".to_vec()),
5652 "Subtype" => Object::Name(b"Form".to_vec()),
5653 "BBox" => Object::Array(vec![
5654 Object::Integer(0), Object::Integer(0),
5655 Object::Integer(10), Object::Integer(10),
5656 ]),
5657 },
5658 b"0 0 10 10 re S\n".to_vec(),
5659 )));
5660
5661 add_xobject_to_page_resources(&mut doc, page_id, "XfaAp0", new_xobject_id);
5662
5663 let xobjects = doc
5664 .get_object(xobject_dict_id)
5665 .expect("xobject dict")
5666 .as_dict()
5667 .expect("xobject dict");
5668 assert!(
5669 xobjects.get(b"R11").is_ok(),
5670 "existing page XObject was lost"
5671 );
5672 assert!(
5673 xobjects.get(b"XfaAp0").is_ok(),
5674 "new flattened widget XObject was not added"
5675 );
5676 }
5677
5678 #[test]
5679 fn encrypted_pdf_without_xfa_returns_ok() {
5680 let mut doc = Document::with_version("1.4");
5682 let pages_id = doc.new_object_id();
5683 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5684 "Type" => Object::Name(b"Page".to_vec()),
5685 "Parent" => Object::Reference(pages_id),
5686 "MediaBox" => Object::Array(vec![
5687 Object::Integer(0), Object::Integer(0),
5688 Object::Integer(612), Object::Integer(792),
5689 ]),
5690 }));
5691 doc.objects.insert(
5692 pages_id,
5693 Object::Dictionary(dictionary! {
5694 "Type" => Object::Name(b"Pages".to_vec()),
5695 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5696 "Count" => Object::Integer(1),
5697 }),
5698 );
5699 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5700 "Type" => Object::Name(b"Catalog".to_vec()),
5701 "Pages" => Object::Reference(pages_id),
5702 }));
5703 doc.trailer.set("Root", Object::Reference(catalog_id));
5704
5705 let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5706 "Filter" => Object::Name(b"Standard".to_vec()),
5707 "V" => Object::Integer(2),
5708 "Length" => Object::Integer(128),
5709 }));
5710 doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5711
5712 let mut buf = Vec::new();
5713 doc.save_to(&mut buf).expect("save test PDF");
5714
5715 let result = flatten_xfa_to_pdf(&buf);
5716 assert!(result.is_ok(), "non-XFA encrypted PDF should return Ok");
5717 }
5718
5719 #[test]
5720 fn encrypted_xfa_pdf_returns_encrypted_error() {
5721 let mut doc = Document::with_version("1.4");
5724 let pages_id = doc.new_object_id();
5725 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5726 "Type" => Object::Name(b"Page".to_vec()),
5727 "Parent" => Object::Reference(pages_id),
5728 "MediaBox" => Object::Array(vec![
5729 Object::Integer(0), Object::Integer(0),
5730 Object::Integer(612), Object::Integer(792),
5731 ]),
5732 }));
5733 doc.objects.insert(
5734 pages_id,
5735 Object::Dictionary(dictionary! {
5736 "Type" => Object::Name(b"Pages".to_vec()),
5737 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5738 "Count" => Object::Integer(1),
5739 }),
5740 );
5741 let xfa_stream_id = doc.add_object(Object::Stream(lopdf::Stream::new(
5743 dictionary! {},
5744 b"<xdp:xdp></xdp:xdp>".to_vec(),
5745 )));
5746 let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
5747 "XFA" => Object::Reference(xfa_stream_id),
5748 }));
5749 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5750 "Type" => Object::Name(b"Catalog".to_vec()),
5751 "Pages" => Object::Reference(pages_id),
5752 "AcroForm" => Object::Reference(acroform_id),
5753 }));
5754 doc.trailer.set("Root", Object::Reference(catalog_id));
5755
5756 let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
5757 "Filter" => Object::Name(b"Standard".to_vec()),
5758 "V" => Object::Integer(2),
5759 "Length" => Object::Integer(128),
5760 }));
5761 doc.trailer.set("Encrypt", Object::Reference(encrypt_id));
5762
5763 let mut buf = Vec::new();
5764 doc.save_to(&mut buf).expect("save encrypted PDF");
5765
5766 let result = flatten_xfa_to_pdf(&buf);
5767 assert!(result.is_err(), "expected Encrypted error");
5768 let err = result.unwrap_err();
5769 assert!(
5770 matches!(err, XfaError::Encrypted(_)),
5771 "expected XfaError::Encrypted, got: {err:?}"
5772 );
5773 }
5774
5775 #[test]
5776 fn owner_only_encrypted_pdf_is_handled_transparently() {
5777 let mut doc = Document::with_version("2.0");
5780 let pages_id = doc.new_object_id();
5781 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5782 "Type" => Object::Name(b"Page".to_vec()),
5783 "Parent" => Object::Reference(pages_id),
5784 "MediaBox" => Object::Array(vec![
5785 Object::Integer(0), Object::Integer(0),
5786 Object::Integer(612), Object::Integer(792),
5787 ]),
5788 }));
5789 doc.objects.insert(
5790 pages_id,
5791 Object::Dictionary(dictionary! {
5792 "Type" => Object::Name(b"Pages".to_vec()),
5793 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
5794 "Count" => Object::Integer(1),
5795 }),
5796 );
5797 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
5798 "Type" => Object::Name(b"Catalog".to_vec()),
5799 "Pages" => Object::Reference(pages_id),
5800 }));
5801 doc.trailer.set("Root", Object::Reference(catalog_id));
5802
5803 let state = lopdf::aes256_encryption_state("secret", "", lopdf::Permissions::default())
5805 .expect("create encryption state");
5806 doc.encrypt(&state).expect("encrypt document");
5807
5808 let mut buf = Vec::new();
5809 doc.save_to(&mut buf).expect("save encrypted PDF");
5810
5811 assert!(
5813 !is_pdf_encrypted(&buf),
5814 "lopdf should auto-decrypt owner-only PDFs"
5815 );
5816
5817 let result = flatten_xfa_to_pdf(&buf);
5819 assert!(
5820 result.is_ok(),
5821 "owner-only encrypted PDF should be handled, got: {result:?}"
5822 );
5823 }
5824
5825 fn build_pdf_with_cid_font(w_array: Vec<Object>, dw: Option<i64>) -> Document {
5827 let mut doc = Document::with_version("1.4");
5828
5829 let mut cid_dict = dictionary! {
5831 "Type" => Object::Name(b"Font".to_vec()),
5832 "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
5833 "BaseFont" => Object::Name(b"TestFont".to_vec()),
5834 "W" => Object::Array(w_array)
5835 };
5836 if let Some(dw_val) = dw {
5837 cid_dict.set("DW", Object::Integer(dw_val));
5838 }
5839 let cid_id = doc.add_object(Object::Dictionary(cid_dict));
5840
5841 let type0_dict = dictionary! {
5843 "Type" => Object::Name(b"Font".to_vec()),
5844 "Subtype" => Object::Name(b"Type0".to_vec()),
5845 "BaseFont" => Object::Name(b"TestFont".to_vec()),
5846 "DescendantFonts" => Object::Array(vec![Object::Reference(cid_id)])
5847 };
5848 doc.add_object(Object::Dictionary(type0_dict));
5849 doc
5850 }
5851
5852 #[test]
5855 fn cid_w_array_consecutive() {
5856 let w = vec![
5857 Object::Integer(120),
5858 Object::Array(vec![
5859 Object::Integer(500),
5860 Object::Integer(600),
5861 Object::Integer(700),
5862 ]),
5863 ];
5864 let doc = build_pdf_with_cid_font(w, None);
5865 let _fonts = extract_embedded_fonts(&doc);
5866
5867 for obj in doc.objects.values() {
5870 let dict = match obj.as_dict() {
5871 Ok(d) => d,
5872 Err(_) => continue,
5873 };
5874 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5875 if subtype == Some(b"Type0".as_slice()) {
5876 let result = extract_cid_font_widths(&doc, dict);
5877 let (first, widths) = result.expect("should parse /W array");
5878 assert_eq!(first, 120);
5879 assert_eq!(widths.len(), 3);
5880 assert_eq!(widths[0], 500); assert_eq!(widths[1], 600); assert_eq!(widths[2], 700); return;
5884 }
5885 }
5886 panic!("Type0 font not found in test document");
5887 }
5888
5889 #[test]
5892 fn cid_w_array_range() {
5893 let w = vec![
5894 Object::Integer(200),
5895 Object::Integer(300),
5896 Object::Integer(250),
5897 ];
5898 let doc = build_pdf_with_cid_font(w, None);
5899
5900 for obj in doc.objects.values() {
5901 let dict = match obj.as_dict() {
5902 Ok(d) => d,
5903 Err(_) => continue,
5904 };
5905 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5906 if subtype == Some(b"Type0".as_slice()) {
5907 let (first, widths) =
5908 extract_cid_font_widths(&doc, dict).expect("should parse /W range");
5909 assert_eq!(first, 200);
5910 assert_eq!(widths.len(), 101); assert!(widths.iter().all(|&w| w == 250));
5912 return;
5913 }
5914 }
5915 panic!("Type0 font not found");
5916 }
5917
5918 #[test]
5923 fn cid_w_array_mixed() {
5924 let w = vec![
5925 Object::Integer(120),
5926 Object::Array(vec![
5927 Object::Integer(500),
5928 Object::Integer(600),
5929 Object::Integer(700),
5930 ]),
5931 Object::Integer(200),
5932 Object::Integer(300),
5933 Object::Integer(250),
5934 ];
5935 let doc = build_pdf_with_cid_font(w, Some(1000));
5936
5937 for obj in doc.objects.values() {
5938 let dict = match obj.as_dict() {
5939 Ok(d) => d,
5940 Err(_) => continue,
5941 };
5942 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5943 if subtype == Some(b"Type0".as_slice()) {
5944 let (first, widths) =
5945 extract_cid_font_widths(&doc, dict).expect("should parse mixed /W");
5946 assert_eq!(first, 120);
5947 assert_eq!(widths.len(), 181); assert_eq!(widths[0], 500); assert_eq!(widths[1], 600); assert_eq!(widths[2], 700); assert_eq!(widths[3], 1000); assert_eq!(widths[79], 1000); assert_eq!(widths[80], 250); assert_eq!(widths[180], 250); return;
5959 }
5960 }
5961 panic!("Type0 font not found");
5962 }
5963
5964 #[test]
5966 fn cid_w_array_default_width() {
5967 let w = vec![
5968 Object::Integer(10),
5969 Object::Array(vec![Object::Integer(400)]),
5970 Object::Integer(20),
5971 Object::Array(vec![Object::Integer(600)]),
5972 ];
5973 let doc = build_pdf_with_cid_font(w, None); for obj in doc.objects.values() {
5976 let dict = match obj.as_dict() {
5977 Ok(d) => d,
5978 Err(_) => continue,
5979 };
5980 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5981 if subtype == Some(b"Type0".as_slice()) {
5982 let (first, widths) = extract_cid_font_widths(&doc, dict).expect("should parse /W");
5983 assert_eq!(first, 10);
5984 assert_eq!(widths[0], 400); assert_eq!(widths[5], 1000); assert_eq!(widths[10], 600); return;
5988 }
5989 }
5990 panic!("Type0 font not found");
5991 }
5992
5993 #[test]
5994 fn extract_embedded_fonts_keeps_simple_pdf_fonts_without_fontfile() {
5995 let mut doc = Document::new();
5996 let font_id = doc.add_object(Object::Dictionary(dictionary! {
5997 "Type" => Object::Name(b"Font".to_vec()),
5998 "Subtype" => Object::Name(b"Type1".to_vec()),
5999 "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
6000 "FirstChar" => Object::Integer(32),
6001 "LastChar" => Object::Integer(34),
6002 "Widths" => Object::Array(vec![
6003 Object::Integer(278),
6004 Object::Integer(333),
6005 Object::Integer(612),
6006 ]),
6007 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
6008 }));
6009
6010 let fonts = extract_embedded_fonts(&doc);
6011 let font = fonts
6012 .iter()
6013 .find(|font| font.name == "MyriadPro-Regular")
6014 .expect("expected reusable simple font");
6015
6016 assert!(font.data.is_empty(), "no FontFile* should keep data empty");
6017 assert_eq!(font.pdf_widths, Some((32, vec![278, 333, 612])));
6018 assert_eq!(
6019 font.pdf_source_font,
6020 Some(PdfSourceFont { object_id: font_id })
6021 );
6022 }
6023
6024 #[test]
6025 fn store_font_data_reserves_family_alias_for_regular_face() {
6026 let mut fonts = Vec::new();
6027 store_font_data(
6028 &mut fonts,
6029 "ArialMT",
6030 Vec::new(),
6031 Some((32, vec![278, 333, 611])),
6032 None,
6033 Some(PdfSourceFont { object_id: (1, 0) }),
6034 );
6035 store_font_data(
6036 &mut fonts,
6037 "Arial-BoldMT",
6038 Vec::new(),
6039 Some((32, vec![278, 333, 611])),
6040 None,
6041 Some(PdfSourceFont { object_id: (2, 0) }),
6042 );
6043 store_font_data(
6044 &mut fonts,
6045 "Arial-ItalicMT",
6046 Vec::new(),
6047 Some((32, vec![278, 333, 611])),
6048 None,
6049 Some(PdfSourceFont { object_id: (3, 0) }),
6050 );
6051
6052 let aliases: Vec<_> = fonts.iter().map(|font| font.name.as_str()).collect();
6053 assert!(aliases.contains(&"ArialMT"));
6054 assert!(aliases.contains(&"Arial-BoldMT"));
6055 assert!(aliases.contains(&"Arial-ItalicMT"));
6056 assert_eq!(
6057 aliases.iter().filter(|name| **name == "Arial").count(),
6058 1,
6059 "only the regular face should claim the bare family alias"
6060 );
6061 }
6062
6063 #[test]
6064 fn store_font_data_keeps_regular_ps_family_alias() {
6065 let mut fonts = Vec::new();
6066 store_font_data(
6067 &mut fonts,
6068 "MyriadPro-Regular",
6069 Vec::new(),
6070 Some((32, vec![278, 333, 612])),
6071 None,
6072 Some(PdfSourceFont { object_id: (4, 0) }),
6073 );
6074
6075 assert!(
6076 fonts.iter().any(|font| font.name == "Myriad Pro"),
6077 "regular PostScript names should still expose their family alias"
6078 );
6079 }
6080
6081 #[test]
6082 fn page_content_streams_resolves_indirect_contents_arrays() {
6083 let mut doc = Document::new();
6084 let stream_a = doc.add_object(Stream::new(
6085 dictionary! {"Length" => Object::Integer(8)},
6086 b"(A) Tj\n".to_vec(),
6087 ));
6088 let stream_b = doc.add_object(Stream::new(
6089 dictionary! {"Length" => Object::Integer(8)},
6090 b"(B) Tj\n".to_vec(),
6091 ));
6092 let contents_array = doc.add_object(Object::Array(vec![
6093 Object::Reference(stream_a),
6094 Object::Reference(stream_b),
6095 ]));
6096 let page_id = doc.add_object(Object::Dictionary(dictionary! {
6097 "Type" => Object::Name(b"Page".to_vec()),
6098 "Contents" => Object::Reference(contents_array),
6099 }));
6100
6101 let streams = page_content_streams(&doc, page_id);
6102
6103 assert_eq!(
6104 streams.len(),
6105 2,
6106 "indirect /Contents arrays must be traversed"
6107 );
6108 assert!(streams[0].windows(2).any(|w| w == b"Tj"));
6109 assert!(streams[1].windows(2).any(|w| w == b"Tj"));
6110 }
6111
6112 #[test]
6113 fn embed_resolved_fonts_reuses_existing_pdf_font_object() {
6114 let mut doc = Document::new();
6115 let source_font_id = doc.add_object(Object::Dictionary(dictionary! {
6116 "Type" => Object::Name(b"Font".to_vec()),
6117 "Subtype" => Object::Name(b"Type1".to_vec()),
6118 "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
6119 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
6120 }));
6121 let before = doc.objects.len();
6122
6123 let mut resolved = HashMap::new();
6124 resolved.insert(
6125 "Myriad Pro_Normal_Normal".to_string(),
6126 ResolvedFont {
6127 name: "Myriad Pro".to_string(),
6128 data: Vec::new(),
6129 face_index: 0,
6130 units_per_em: 1000,
6131 ascender: 800,
6132 descender: -200,
6133 pdf_widths: Some((32, vec![278, 333, 612])),
6134 pdf_encoding: None,
6135 pdf_source_font: Some(PdfSourceFont {
6136 object_id: source_font_id,
6137 }),
6138 },
6139 );
6140
6141 let empty_layout = LayoutDom { pages: vec![] };
6142 let (_font_map, font_objects, metrics_data) =
6143 embed_resolved_fonts(&mut doc, &resolved, &empty_layout);
6144
6145 assert_eq!(
6146 doc.objects.len(),
6147 before,
6148 "should not embed a new font object"
6149 );
6150 assert_eq!(font_objects.len(), 1);
6151 assert_eq!(font_objects[0].1, source_font_id);
6152 assert!(
6153 metrics_data["Myriad Pro_Normal_Normal"].font_data.is_none(),
6154 "reused simple fonts must keep WinAnsi text encoding"
6155 );
6156 }
6157
6158 #[test]
6159 fn strip_undefined_entities_preserves_raw_ampersands_in_processing_instructions() {
6160 let xml = r##"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><?renderCache.textRun 24 A. Adjustment & Location 0 1417 14917 0 0 0 "Myriad Pro" 0 0 18000 ISO-8859-1?><?renderCache.subset "Arial" 0 0 ISO-8859-1 "#$%&'()+,-./" ?><subform name="form1"><field name="A"/></subform></template>"##;
6161
6162 let stripped = strip_undefined_xml_entities(xml);
6163
6164 assert_eq!(
6165 stripped, xml,
6166 "raw ampersands inside processing instructions are valid and must survive sanitization"
6167 );
6168 roxmltree::Document::parse(&stripped)
6169 .expect("processing instructions must remain parseable");
6170 }
6171
6172 #[test]
6173 fn strip_undefined_entities_drops_only_true_named_entity_references() {
6174 let xml = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="form1"><draw name="D"><value><text>alpha &bogus; beta © & gamma</text></value></draw></subform></template>"#;
6175
6176 let stripped = strip_undefined_xml_entities(xml);
6177
6178 assert!(
6179 !stripped.contains("&bogus;"),
6180 "unknown named entities should still be removed for roxmltree compatibility"
6181 );
6182 assert!(stripped.contains("©"));
6183 assert!(stripped.contains("&"));
6184 roxmltree::Document::parse(&stripped).expect("sanitized XML should parse");
6185 }
6186
6187 #[test]
6190 fn form_dom_expands_repeating_subform_instances() {
6191 use xfa_layout_engine::form::FormNodeType;
6192
6193 let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
6195 <subform name="root" layout="tb">
6196 <pageSet><pageArea name="P1">
6197 <contentArea w="200mm" h="280mm"/>
6198 <medium short="210mm" long="297mm"/>
6199 </pageArea></pageSet>
6200 <subform name="body" layout="tb">
6201 <subform name="Items" layout="tb">
6202 <bind match="none"/>
6203 <subform name="Row" layout="tb">
6204 <bind match="none"/>
6205 <occur max="-1"/>
6206 <field name="Label"><ui><textEdit/></ui></field>
6207 </subform>
6208 </subform>
6209 </subform>
6210 </subform>
6211 </template>"#;
6212
6213 let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
6215 <subform name="root">
6216 <subform name="body">
6217 <subform name="Items">
6218 <instanceManager name="_Row"/>
6219 <subform name="Row">
6220 <field name="Label"><value><text>Alpha</text></value></field>
6221 </subform>
6222 <subform name="Row">
6223 <field name="Label"><value><text>Beta</text></value></field>
6224 </subform>
6225 <subform name="Row">
6226 <field name="Label"><value><text>Gamma</text></value></field>
6227 </subform>
6228 </subform>
6229 </subform>
6230 </subform>
6231 </form>"#;
6232
6233 let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
6234 let merger = crate::merger::FormMerger::new(&data_dom);
6235 let (mut tree, root_id) = merger.merge(template).unwrap();
6236
6237 fn find_by_name(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
6240 for &c in &tree.get(parent).children {
6241 if tree.get(c).name == name {
6242 return Some(c);
6243 }
6244 if let Some(found) = find_by_name(tree, c, name) {
6245 return Some(found);
6246 }
6247 }
6248 None
6249 }
6250 let items_id =
6251 find_by_name(&tree, root_id, "Items").expect("Items subform not found in tree");
6252 let rows_before = tree
6253 .get(items_id)
6254 .children
6255 .iter()
6256 .filter(|&&c| tree.get(c).name == "Row")
6257 .count();
6258 assert_eq!(
6259 rows_before, 1,
6260 "template merge should produce 1 Row (bind=none)"
6261 );
6262
6263 let _ = apply_form_dom_presence(
6265 &mut tree,
6266 root_id,
6267 form_xml,
6268 XfaRenderingPolicy::SavedStateFaithful,
6269 false,
6270 );
6271
6272 let rows_after: Vec<FormNodeId> = tree
6274 .get(items_id)
6275 .children
6276 .iter()
6277 .filter(|&&c| tree.get(c).name == "Row")
6278 .copied()
6279 .collect();
6280 assert_eq!(
6281 rows_after.len(),
6282 3,
6283 "form DOM should expand to 3 Row instances"
6284 );
6285
6286 let values: Vec<String> = rows_after
6287 .iter()
6288 .map(|&row_id| {
6289 let label_id = tree.get(row_id).children[0];
6290 match &tree.get(label_id).node_type {
6291 FormNodeType::Field { value } => value.clone(),
6292 _ => String::new(),
6293 }
6294 })
6295 .collect();
6296 assert_eq!(values, vec!["Alpha", "Beta", "Gamma"]);
6297 }
6298
/// Single-template `pageSet` expansion.
///
/// The template declares one `Page1` pageArea while the form DOM lists five `Page1`
/// instances. After `apply_form_dom_presence` the tree must contain five `PageArea`
/// nodes, each with `runtime_instantiated_page` set.
#[test]
fn form_dom_expands_uniform_page_area_template() {
    use xfa_layout_engine::form::FormNodeType;

    /// Appends `id` to `out` when it is a `PageArea`, then recurses into its children.
    fn collect_page_areas(tree: &FormTree, id: FormNodeId, out: &mut Vec<FormNodeId>) {
        let node = tree.get(id);
        if matches!(node.node_type, FormNodeType::PageArea { .. }) {
            out.push(id);
        }
        node.children
            .iter()
            .for_each(|&child| collect_page_areas(tree, child, out));
    }

    let template_xml = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
        <subform name="root" layout="tb">
            <pageSet>
                <pageArea name="Page1">
                    <contentArea w="200mm" h="280mm"/>
                    <medium short="210mm" long="297mm"/>
                </pageArea>
            </pageSet>
        </subform>
    </template>"#;

    let saved_form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
        <subform name="root">
            <pageSet>
                <pageArea name="Page1"/>
                <pageArea name="Page1"/>
                <pageArea name="Page1"/>
                <pageArea name="Page1"/>
                <pageArea name="Page1"/>
            </pageSet>
        </subform>
    </form>"#;

    // Merge against an empty data DOM so only the form DOM drives the page count.
    let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
    let merger = crate::merger::FormMerger::new(&data_dom);
    let (mut tree, root_id) = merger.merge(template_xml).unwrap();

    // The returned counters are irrelevant here; only the tree mutation is checked.
    let _ = apply_form_dom_presence(
        &mut tree,
        root_id,
        saved_form_xml,
        XfaRenderingPolicy::SavedStateFaithful,
        false,
    );

    let mut found_areas = Vec::new();
    collect_page_areas(&tree, root_id, &mut found_areas);
    assert_eq!(
        found_areas.len(),
        5,
        "uniform pageArea expansion: 5 form-DOM instances must clone the template"
    );
    for area_id in found_areas {
        assert!(
            tree.meta(area_id).runtime_instantiated_page,
            "every expanded pageArea must be flagged as runtime-instantiated"
        );
    }
}
6368
/// Multi-template `pageSet`: no pageArea cloning.
///
/// The template declares two differently named pageAreas (`Page1`, `OverFlowPage`)
/// and the form DOM lists five instances across both names. After
/// `apply_form_dom_presence` the tree must still hold exactly the two original
/// `PageArea` nodes, and neither may have `runtime_instantiated_page` set.
#[test]
fn form_dom_skips_multi_template_page_area_expansion() {
    use xfa_layout_engine::form::FormNodeType;

    /// Appends `id` to `out` when it is a `PageArea`, then recurses into its children.
    fn collect_page_areas(tree: &FormTree, id: FormNodeId, out: &mut Vec<FormNodeId>) {
        let node = tree.get(id);
        if matches!(node.node_type, FormNodeType::PageArea { .. }) {
            out.push(id);
        }
        node.children
            .iter()
            .for_each(|&child| collect_page_areas(tree, child, out));
    }

    let template_xml = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
        <subform name="root" layout="tb">
            <pageSet>
                <pageArea name="Page1">
                    <contentArea w="200mm" h="280mm"/>
                    <medium short="210mm" long="297mm"/>
                </pageArea>
                <pageArea name="OverFlowPage">
                    <contentArea w="200mm" h="280mm"/>
                    <medium short="210mm" long="297mm"/>
                </pageArea>
            </pageSet>
        </subform>
    </template>"#;

    let saved_form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
        <subform name="root">
            <pageSet>
                <pageArea name="Page1"/>
                <pageArea name="OverFlowPage"/>
                <pageArea name="OverFlowPage"/>
                <pageArea name="OverFlowPage"/>
                <pageArea name="OverFlowPage"/>
            </pageSet>
        </subform>
    </form>"#;

    let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
    let merger = crate::merger::FormMerger::new(&data_dom);
    let (mut tree, root_id) = merger.merge(template_xml).unwrap();

    // The returned counters are irrelevant here; only the tree mutation is checked.
    let _ = apply_form_dom_presence(
        &mut tree,
        root_id,
        saved_form_xml,
        XfaRenderingPolicy::SavedStateFaithful,
        false,
    );

    let mut found_areas = Vec::new();
    collect_page_areas(&tree, root_id, &mut found_areas);
    assert_eq!(
        found_areas.len(),
        2,
        "multi-template pageSet must not clone pageAreas (kept original 2)"
    );
    for area_id in found_areas {
        assert!(
            !tree.meta(area_id).runtime_instantiated_page,
            "non-expansion case must not set runtime_instantiated_page flag"
        );
    }
}
6439
/// Admission of subforms that are absent from the saved form DOM.
///
/// The template has three sibling subforms (`Bound`, `Unbound`, `Present`); the form
/// DOM only mentions `Present`. `Bound` is given a `bound_data_node`, `Unbound` is
/// not. Three policy/override combinations are checked:
///
/// * `SavedStateFaithful`, override off: nothing admitted, both end up hidden.
/// * `SavedStateFaithful`, override on: only `Bound` is admitted and stays visible.
/// * `FreshMergeExperimental`, override off: only `Bound` is admitted and stays visible.
#[test]
fn formdom_admit_databound_override_admits_only_data_bound() {
    use xfa_layout_engine::form::{FormNodeType, Presence};

    /// Depth-first lookup of the first descendant of `parent` named `name`.
    fn find(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
        tree.get(parent).children.iter().find_map(|&child| {
            if tree.get(child).name == name {
                Some(child)
            } else {
                find(tree, child, name)
            }
        })
    }

    let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
        <subform name="root" layout="tb">
            <subform name="body" layout="tb">
                <subform name="Bound" layout="tb">
                    <field name="A"><ui><textEdit/></ui></field>
                </subform>
                <subform name="Unbound" layout="tb">
                    <field name="B"><ui><textEdit/></ui></field>
                </subform>
                <subform name="Present" layout="tb">
                    <field name="C"><ui><textEdit/></ui></field>
                </subform>
            </subform>
        </subform>
    </template>"#;

    let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
        <subform name="root">
            <subform name="body">
                <subform name="Present">
                    <field name="C"><value><text>x</text></value></field>
                </subform>
            </subform>
        </subform>
    </form>"#;

    // Builds a fresh tree per call and returns
    // (admitted count, presence of `Bound`, presence of `Unbound`).
    let run_case = |policy: XfaRenderingPolicy, override_on: bool| {
        let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
        let merger = crate::merger::FormMerger::new(&data_dom);
        let (mut tree, root_id) = merger.merge(template).unwrap();

        let bound = find(&tree, root_id, "Bound").expect("Bound subform in tree");
        let unbound = find(&tree, root_id, "Unbound").expect("Unbound subform in tree");
        assert!(matches!(tree.get(bound).node_type, FormNodeType::Subform));
        assert!(matches!(tree.get(unbound).node_type, FormNodeType::Subform));

        // Put both candidates into the same starting state so that the only
        // difference between them is whether a data node is bound.
        for id in [bound, unbound] {
            let meta = tree.meta_mut(id);
            meta.presence = Presence::Visible;
            meta.is_zero_instance_prototype = false;
            meta.data_bind_none = false;
        }
        tree.meta_mut(bound).bound_data_node = Some(0);
        tree.meta_mut(unbound).bound_data_node = None;

        let (admitted, _, _) =
            apply_form_dom_presence(&mut tree, root_id, form_xml, policy, override_on);
        let bound_presence = tree.meta(bound).presence;
        let unbound_presence = tree.meta(unbound).presence;
        (admitted, bound_presence, unbound_presence)
    };

    // Case 1: saved-state policy without the override.
    let (admitted, bound_presence, unbound_presence) =
        run_case(XfaRenderingPolicy::SavedStateFaithful, false);
    assert_eq!(
        admitted, 0,
        "override off admits nothing under SavedStateFaithful"
    );
    assert_eq!(
        bound_presence,
        Presence::Hidden,
        "data-bound suppressed when override off"
    );
    assert_eq!(
        unbound_presence,
        Presence::Hidden,
        "non-data suppressed when override off"
    );

    // Case 2: saved-state policy with the override enabled.
    let (admitted, bound_presence, unbound_presence) =
        run_case(XfaRenderingPolicy::SavedStateFaithful, true);
    assert_eq!(
        admitted, 1,
        "override on admits exactly the data-bound subform"
    );
    assert_eq!(
        bound_presence,
        Presence::Visible,
        "data-bound admitted when override on"
    );
    assert_eq!(
        unbound_presence,
        Presence::Hidden,
        "non-data subform must stay suppressed with override on (over-pagination guard)"
    );

    // Case 3: fresh-merge policy, no override needed.
    let (admitted, bound_presence, unbound_presence) =
        run_case(XfaRenderingPolicy::FreshMergeExperimental, false);
    assert_eq!(
        admitted, 1,
        "fresh-merge admits the data-bound subform without override"
    );
    assert_eq!(
        bound_presence,
        Presence::Visible,
        "data-bound admitted under fresh-merge"
    );
    assert_eq!(
        unbound_presence,
        Presence::Hidden,
        "non-data suppressed under fresh-merge"
    );
}
6583
/// `flatten_xfa_to_pdf_simulate_reentrant` must return `Err`, and the error text must
/// contain the word "recursively".
#[test]
fn flatten_xfa_to_pdf_recursion_guard_returns_error() {
    let fixture = build_xfa_pdf(SIMPLE_XDP);
    let err_msg = match flatten_xfa_to_pdf_simulate_reentrant(&fixture) {
        Err(err) => err.to_string(),
        Ok(_) => panic!("expected recursion guard to return Err, got Ok"),
    };
    assert!(
        err_msg.contains("recursively"),
        "expected error message to mention recursion, got: {err_msg}"
    );
}
6603
/// Two back-to-back flatten calls on the same thread: the second must succeed,
/// whatever the first one returned.
#[test]
fn flatten_xfa_to_pdf_depth_counter_resets_after_call() {
    // First call: its outcome is deliberately ignored.
    let first_input = build_xfa_pdf(SIMPLE_XDP);
    let _ = flatten_xfa_to_pdf(&first_input);

    // Second call on a freshly built fixture must not be rejected.
    let second_input = build_xfa_pdf(SIMPLE_XDP);
    let result = flatten_xfa_to_pdf(&second_input);
    assert!(
        result.is_ok(),
        "second flatten call should succeed, got: {result:?}"
    );
}
6619
/// Smoke test: flattening an empty byte slice must return (either `Ok` or `Err`)
/// instead of panicking. The result itself is intentionally not inspected.
#[test]
fn flatten_xfa_to_pdf_does_not_panic_on_empty_input() {
    let _ = flatten_xfa_to_pdf(&[]);
}
6631
/// Smoke test: the flatten call on the `SIMPLE_XDP` fixture must return without
/// panicking. The result (success or error) is intentionally discarded.
#[test]
fn flatten_pipeline_completes_on_minimal_xfa_pdf() {
    let fixture = build_xfa_pdf(SIMPLE_XDP);
    let _ = flatten_xfa_to_pdf(&fixture);
}
6648
/// The layout-dump variant must emit exactly the same PDF bytes as the plain flatten,
/// and its dump must contain at least one page whose first entry is page 1 and whose
/// used height does not exceed the page height.
#[test]
fn flatten_with_layout_dump_preserves_pdf_bytes() {
    let input = build_xfa_pdf(SIMPLE_XDP);

    let flattened = flatten_xfa_to_pdf(&input).expect("plain flatten should succeed");
    let (flattened_with_dump, layout_dump) =
        flatten_xfa_to_pdf_with_layout_dump(&input).expect("dump flatten should succeed");

    // Byte-for-byte equality: producing the dump must not perturb the output.
    assert_eq!(flattened_with_dump, flattened);

    // Guard the indexing below before looking at the first page.
    assert!(!layout_dump.pages.is_empty());
    assert_eq!(layout_dump.pages[0].page_num, 1);
    assert!(layout_dump.pages[0].used_height <= layout_dump.pages[0].page_height);
}
6661
/// Flattening must drop `/NeedsRendering` from the document catalog.
///
/// The fixture is re-saved with `/NeedsRendering true` injected into the catalog
/// before flattening. The injection panics if the catalog cannot be reached as a
/// dictionary: silently skipping it would let the final assertion pass without the
/// removal ever being exercised.
#[test]
fn flatten_removes_needs_rendering() {
    let pdf_bytes = {
        let base = build_xfa_pdf(SIMPLE_XDP);
        let mut doc = Document::load_mem(&base).expect("parse for NeedsRendering test");
        let root_id = match doc.trailer.get(b"Root") {
            Ok(Object::Reference(id)) => *id,
            _ => panic!("no Root in trailer"),
        };
        // Fail loudly instead of skipping: the key must really be in the input.
        match doc.get_object_mut(root_id) {
            Ok(Object::Dictionary(dict)) => dict.set("NeedsRendering", Object::Boolean(true)),
            _ => panic!("catalog is not a dictionary; cannot inject /NeedsRendering"),
        }
        let mut out = Vec::new();
        doc.save_to(&mut out)
            .expect("re-save for NeedsRendering test");
        out
    };

    let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");

    // Re-parse the output and inspect the catalog directly.
    let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
    let root_id = match doc.trailer.get(b"Root") {
        Ok(Object::Reference(id)) => *id,
        _ => panic!("no Root in flattened trailer"),
    };
    let catalog = doc.get_dictionary(root_id).expect("catalog dict");
    assert!(
        catalog.get(b"NeedsRendering").is_err(),
        "/NeedsRendering must be absent after flatten"
    );
}
6700
/// After flattening, the serialized output must not contain the byte sequence `/XFA`
/// anywhere (checked on a lossy UTF-8 view of the whole file).
#[test]
fn flatten_removes_xfa_entry() {
    let input = build_xfa_pdf(SIMPLE_XDP);
    let flattened = flatten_xfa_to_pdf(&input).expect("flatten failed");

    let mentions_xfa = String::from_utf8_lossy(&flattened).contains("/XFA");
    assert!(
        !mentions_xfa,
        "/XFA must be absent from flattened output, but was found"
    );
}
6716
/// `remove_acroform` must delete the AcroForm object and every XFA packet object from
/// `doc.objects`, and the re-serialized document must not contain leftover packet
/// payload markers (`xdp:xdp`, `<template`).
#[test]
fn remove_acroform_purges_xfa_packet_objects() {
    let (mut doc, acroform_id, xfa_ids) = build_xfa_doc_with_xfa_array();

    remove_acroform(&mut doc);

    // Object-table level: nothing referenced by the form may survive.
    assert!(
        !doc.objects.contains_key(&acroform_id),
        "AcroForm object should be removed from doc.objects"
    );
    xfa_ids.iter().for_each(|xfa_id| {
        assert!(
            !doc.objects.contains_key(xfa_id),
            "XFA packet object {xfa_id:?} should be removed from doc.objects"
        );
    });

    // Byte level: the saved file must not carry orphaned packet contents.
    let mut serialized = Vec::new();
    doc.save_to(&mut serialized).expect("save cleaned PDF");
    let serialized_text = String::from_utf8_lossy(&serialized);
    assert!(
        !serialized_text.contains("xdp:xdp"),
        "serialized output should not contain orphaned XFA packet payloads"
    );
    assert!(
        !serialized_text.contains("<template"),
        "serialized output should not contain orphaned template payloads"
    );
}
6746
/// After flattening, no page may carry a direct, empty `/Annots` array.
///
/// The fixture's first page is given `/Annots []` before flattening. The injection
/// panics if the page object is not a dictionary: silently skipping it would leave
/// the input without the empty array and make the post-condition trivially true.
#[test]
fn flatten_removes_empty_annots_arrays() {
    let pdf_bytes = {
        let base = build_xfa_pdf(SIMPLE_XDP);
        let mut doc = Document::load_mem(&base).expect("parse for annots test");
        let page_id = doc.page_iter().next().expect("at least one page");
        // Fail loudly instead of skipping: the empty array must really be in the input.
        match doc.get_object_mut(page_id) {
            Ok(Object::Dictionary(dict)) => dict.set("Annots", Object::Array(vec![])),
            _ => panic!("page object is not a dictionary; cannot inject empty /Annots"),
        }
        let mut out = Vec::new();
        doc.save_to(&mut out).expect("re-save for annots test");
        out
    };

    let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
    let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
    for page_id in doc.page_iter() {
        let page = doc.get_dictionary(page_id).expect("page dict");
        // Only a direct array is inspected; an absent key is accepted as-is.
        if let Ok(Object::Array(arr)) = page.get(b"Annots") {
            assert!(
                !arr.is_empty(),
                "page {:?}: /Annots must either be absent or non-empty after flatten",
                page_id
            );
        }
    }
}
6777
/// `remove_acroform` must also clean pages whose `/Annots` is an indirect reference.
///
/// The page's annotations are moved into a standalone array object and the page is
/// re-pointed at it by reference. After `remove_acroform` the page must no longer
/// have an `/Annots` key. Installing the reference panics if the page object is not a
/// dictionary: silently skipping it would leave the test exercising a direct array
/// instead of the indirect case it is named for.
#[test]
fn remove_acroform_strips_widgets_from_indirect_annots_arrays() {
    let appearance = Object::Stream(Stream::new(
        dictionary! {
            "Type" => Object::Name(b"XObject".to_vec()),
            "Subtype" => Object::Name(b"Form".to_vec()),
            "BBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(20), Object::Integer(20),
            ]),
            "Resources" => Object::Dictionary(dictionary! {}),
        },
        b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
    ));
    let pdf_bytes = build_xfa_pdf_with_widget_appearance(
        Vec::new(),
        appearance,
        dictionary! {
            "FT" => Object::Name(b"Tx".to_vec()),
            "T" => Object::string_literal("field[0]"),
        },
    );

    let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
    let page_id = doc.page_iter().next().expect("page");

    // Relocate the page's annotations into their own object and reference it.
    let annots = page_annotations(&doc, page_id);
    let annots_id = doc.add_object(Object::Array(annots));
    match doc.get_object_mut(page_id) {
        Ok(Object::Dictionary(page_dict)) => {
            page_dict.set("Annots", Object::Reference(annots_id))
        }
        _ => panic!("page object is not a dictionary; cannot install indirect /Annots"),
    }

    remove_acroform(&mut doc);

    let page = doc.get_dictionary(page_id).expect("page dict");
    assert!(
        page.get(b"Annots").is_err(),
        "widget-only indirect /Annots must be removed"
    );
}
6817
/// A PDF that has an AcroForm but no `/XFA` entry must still be cleaned by
/// `flatten_xfa_to_pdf`: the catalog loses `/AcroForm` and the first page keeps no
/// annotations.
///
/// The input is built from the widget fixture with `/XFA` removed from its AcroForm
/// dictionary. That step panics if the AcroForm is not a dictionary: silently
/// skipping it would leave `/XFA` in place and the test would no longer cover the
/// AcroForm-only path.
#[test]
fn acroform_without_xfa_falls_back_to_static_cleanup() {
    let appearance = Object::Stream(Stream::new(
        dictionary! {
            "Type" => Object::Name(b"XObject".to_vec()),
            "Subtype" => Object::Name(b"Form".to_vec()),
            "BBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(20), Object::Integer(20),
            ]),
            "Resources" => Object::Dictionary(dictionary! {}),
        },
        b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
    ));
    let pdf_bytes = build_xfa_pdf_with_widget_appearance(
        Vec::new(),
        appearance,
        dictionary! {
            "FT" => Object::Name(b"Tx".to_vec()),
            "T" => Object::string_literal("field[0]"),
        },
    );

    // Turn the fixture into an AcroForm-only document.
    let mut doc = Document::load_mem(&pdf_bytes).expect("parse source PDF");
    let root_id = match doc.trailer.get(b"Root") {
        Ok(Object::Reference(id)) => *id,
        _ => panic!("no Root"),
    };
    let acroform_id = doc
        .get_dictionary(root_id)
        .expect("catalog")
        .get(b"AcroForm")
        .expect("AcroForm")
        .as_reference()
        .expect("AcroForm ref");
    match doc.get_object_mut(acroform_id) {
        Ok(Object::Dictionary(acroform)) => {
            acroform.remove(b"XFA");
        }
        _ => panic!("AcroForm is not a dictionary; cannot strip /XFA"),
    }
    let mut acroform_only = Vec::new();
    doc.save_to(&mut acroform_only)
        .expect("save AcroForm-only PDF");

    // Flatten and verify both the catalog and the page were cleaned.
    let flattened = flatten_xfa_to_pdf(&acroform_only).expect("flatten failed");
    let flattened_doc = Document::load_mem(&flattened).expect("parse flattened PDF");
    let root_id = match flattened_doc.trailer.get(b"Root") {
        Ok(Object::Reference(id)) => *id,
        _ => panic!("no Root in flattened PDF"),
    };
    let catalog = flattened_doc
        .get_dictionary(root_id)
        .expect("flattened catalog");
    assert!(
        catalog.get(b"AcroForm").is_err(),
        "AcroForm-only PDFs should still be cleaned by flatten"
    );

    let page_id = flattened_doc.page_iter().next().expect("flattened page");
    assert!(
        page_annotations(&flattened_doc, page_id).is_empty(),
        "flattened AcroForm-only PDFs should not retain widget annotations"
    );
}
6880
/// `validate_flattened_pdf` on a hand-built one-page PDF without any form entries
/// must report: no AcroForm, no XFA, no NeedsRendering, one page, and no warnings.
#[test]
fn validate_flattened_pdf_clean_pdf_passes() {
    let mut doc = Document::with_version("1.4");

    // Reserve the page-tree id first so the page can name it as its parent.
    let pages_id = doc.new_object_id();
    let page_dict = dictionary! {
        "Type" => Object::Name(b"Page".to_vec()),
        "Parent" => Object::Reference(pages_id),
        "MediaBox" => Object::Array(vec![
            Object::Integer(0), Object::Integer(0),
            Object::Integer(612), Object::Integer(792),
        ])
    };
    let page_id = doc.add_object(Object::Dictionary(page_dict));

    let pages_dict = dictionary! {
        "Type" => Object::Name(b"Pages".to_vec()),
        "Kids" => Object::Array(vec![Object::Reference(page_id)]),
        "Count" => Object::Integer(1)
    };
    doc.objects.insert(pages_id, Object::Dictionary(pages_dict));

    let catalog_dict = dictionary! {
        "Type" => Object::Name(b"Catalog".to_vec()),
        "Pages" => Object::Reference(pages_id)
    };
    let catalog_id = doc.add_object(Object::Dictionary(catalog_dict));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut clean_pdf = Vec::new();
    doc.save_to(&mut clean_pdf).expect("save clean PDF");

    let report = validate_flattened_pdf(&clean_pdf).expect("validate failed");
    assert!(report.has_no_acroform, "clean PDF should have no AcroForm");
    assert!(report.has_no_xfa, "clean PDF should have no XFA");
    assert!(
        report.has_no_needs_rendering,
        "clean PDF should have no NeedsRendering"
    );
    assert_eq!(report.page_count, 1, "clean PDF should report 1 page");
    assert!(
        report.warnings.is_empty(),
        "clean PDF should produce no warnings, got: {:?}",
        report.warnings
    );
}
6932
/// `validate_flattened_pdf` on an empty byte slice must return `Ok` with a page count
/// of zero and at least one warning.
#[test]
fn validate_flattened_pdf_does_not_panic_on_empty_input() {
    let outcome = validate_flattened_pdf(&[]);
    assert!(
        outcome.is_ok(),
        "expected Ok from empty input, got: {:?}",
        outcome.err()
    );

    let report = outcome.unwrap();
    assert_eq!(report.page_count, 0, "empty input has 0 pages");
    assert!(
        !report.warnings.is_empty(),
        "empty input should produce at least one warning"
    );
}
6950
/// `compare_flatten_quality` on the `SIMPLE_XDP` fixture and its flattened output:
/// both page counts must be at least 1, and `page_count_match` must agree with a
/// direct comparison of the two counts.
#[test]
fn compare_flatten_quality_page_count_comparison() {
    let original = build_xfa_pdf(SIMPLE_XDP);
    let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
    let quality =
        compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");

    assert!(
        quality.page_count_before >= 1,
        "original must have >= 1 page"
    );
    assert!(
        quality.page_count_after >= 1,
        "flattened must have >= 1 page"
    );

    // The flag must be derived from the two counts, not set independently.
    assert_eq!(
        quality.page_count_match,
        quality.page_count_before == quality.page_count_after,
        "page_count_match must equal page_count_before == page_count_after"
    );
}
6978
/// `content_ratio` must be finite, non-negative, and equal (within 1e-9) to
/// `content_stream_bytes_after / content_stream_bytes_before`, with 1.0 expected
/// when the "before" byte count is zero.
#[test]
fn compare_flatten_quality_content_ratio_computed() {
    let original = build_xfa_pdf(SIMPLE_XDP);
    let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
    let metrics =
        compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");

    assert!(
        metrics.content_ratio.is_finite() && metrics.content_ratio >= 0.0,
        "content_ratio must be finite and >= 0, got: {}",
        metrics.content_ratio
    );

    // Recompute the ratio independently from the raw byte counts.
    let expected = if metrics.content_stream_bytes_before != 0 {
        metrics.content_stream_bytes_after as f64 / metrics.content_stream_bytes_before as f64
    } else {
        1.0_f64
    };
    let deviation = (metrics.content_ratio - expected).abs();
    assert!(
        deviation < 1e-9,
        "content_ratio mismatch: expected {expected}, got {}",
        metrics.content_ratio
    );
}
7004
/// When the original XDP has a template but no datasets packet,
/// `validate_text_completeness` must report an empty expected-value set and a
/// completeness ratio of exactly 1.0. The "flattened" side is a hand-built blank
/// one-page PDF.
#[test]
fn validate_text_completeness_no_datasets_returns_perfect_ratio() {
    let xdp = r#"<?xml version="1.0"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
  <template>
    <subform name="root">
      <field name="greeting"><ui><textEdit/></ui></field>
    </subform>
  </template>
</xdp:xdp>"#;
    let original = build_xfa_pdf(xdp);

    // Build the blank one-page document used as the flattened counterpart.
    let mut doc = Document::with_version("1.4");
    let pages_id = doc.new_object_id();
    let page_dict = dictionary! {
        "Type" => Object::Name(b"Page".to_vec()),
        "Parent" => Object::Reference(pages_id),
        "MediaBox" => Object::Array(vec![
            Object::Integer(0), Object::Integer(0),
            Object::Integer(612), Object::Integer(792),
        ])
    };
    let page_id = doc.add_object(Object::Dictionary(page_dict));

    let pages_dict = dictionary! {
        "Type" => Object::Name(b"Pages".to_vec()),
        "Kids" => Object::Array(vec![Object::Reference(page_id)]),
        "Count" => Object::Integer(1)
    };
    doc.objects.insert(pages_id, Object::Dictionary(pages_dict));

    let catalog_dict = dictionary! {
        "Type" => Object::Name(b"Catalog".to_vec()),
        "Pages" => Object::Reference(pages_id)
    };
    let catalog_id = doc.add_object(Object::Dictionary(catalog_dict));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut flattened = Vec::new();
    doc.save_to(&mut flattened).unwrap();

    let report = validate_text_completeness(&original, &flattened)
        .expect("validate_text_completeness should not fail");
    assert!(
        report.expected_values.is_empty(),
        "no datasets packet means no expected values"
    );
    assert_eq!(
        report.completeness_ratio, 1.0,
        "empty expected set should yield ratio 1.0"
    );
}
7061
/// `validate_text_completeness` with two empty byte slices must return `Ok` with a
/// ratio of 1.0 and empty expected/missing value lists.
#[test]
fn validate_text_completeness_empty_inputs_do_not_panic() {
    let v = match validate_text_completeness(&[], &[]) {
        Ok(report) => report,
        Err(_) => panic!("should return Ok on empty inputs"),
    };
    assert_eq!(v.completeness_ratio, 1.0);
    assert!(v.expected_values.is_empty());
    assert!(v.missing_values.is_empty());
}
7072
/// Smoke test: flattening an empty byte string must return without panicking.
///
/// NOTE(review): the test name also promises "does not error", but the result is
/// discarded and never checked for `Ok` — confirm whether that is intentional.
#[test]
fn flatten_empty_bytes_does_not_panic_and_does_not_error() {
    let empty_input = b"";
    let _ = flatten_xfa_to_pdf(empty_input);
}
7089
/// If flattening a header-and-EOF-only byte string succeeds, the output must equal
/// the input byte-for-byte. An `Err` result is tolerated and makes the test a no-op.
#[test]
fn flatten_non_xfa_bytes_returns_input_unchanged() {
    let input = b"%PDF-1.4\n%%EOF\n";
    let outcome = flatten_xfa_to_pdf(input);
    // Only the success path carries an assertion; failures are accepted silently.
    if let Ok(out) = outcome {
        assert_eq!(out, input, "non-XFA input should pass through unchanged");
    }
}
7103}