use lopdf::{dictionary, Dictionary, Document, Object, ObjectId, Stream, StringFormat};
use std::cell::Cell;
use std::collections::{HashMap, HashSet};
use std::fmt::Write as FmtWrite;
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
59
thread_local! {
    // Per-thread re-entrancy counter for `flatten_xfa_to_pdf_internal`.
    // Checked at pipeline entry so a recursive flatten call aborts cleanly
    // instead of overflowing the stack.
    static FLATTEN_DEPTH: Cell<u32> = const { Cell::new(0) };
}
77
78#[cfg(feature = "xfa-js-sandboxed")]
79use crate::dynamic::apply_dynamic_scripts_with_runtime;
80use crate::dynamic::{
81 apply_dynamic_scripts, apply_dynamic_scripts_with_mode, DynamicScriptOutcome, JsExecutionMode,
82 OutputQuality,
83};
84use crate::error::{Result, XfaError};
85use crate::extract::extract_xfa_from_bytes;
86use crate::font_bridge::{
87 font_variant_key, pdf_glyph_name_to_unicode, CidFontInfo, EmbeddedFontData, PdfBaseEncoding,
88 PdfSimpleEncoding, PdfSourceFont, ResolvedFont, XfaFontResolver, XfaFontSpec,
89};
90use crate::image_bridge::embed_image;
91use crate::javascript_policy::{self, JavaScriptEntryPoint};
92use crate::merger::FormMerger;
93use crate::render_bridge::{
94 generate_all_overlays, generate_field_values_overlays, unicode_to_winansi, FontMetricsData,
95 PageOverlay, XfaRenderConfig,
96};
97use xfa_dom_resolver::data_dom::DataDom;
98use xfa_layout_engine::form::{DrawContent, FormNodeId, FormNodeStyle, FormTree};
99use xfa_layout_engine::layout::{
100 LayoutContent, LayoutDom, LayoutEngine, LayoutNode, LayoutProfile,
101};
102use xfa_layout_engine::trace::{sites as trace_sites, Reason as TraceReason};
103
104use crate::adobe_compat::{
105 cap_suppression_by_form_dom, emit_bind_none_summary, emit_non_data_widget_summary,
106 exclude_bind_none_fields_from_page_data_suppression,
107 exclude_non_data_widgets_from_page_suppression,
108 static_xfaf_excess_page_trim_with_form_dom_guard,
109 suppress_empty_pages_only_when_real_data_bound, BindNoneClassification, WidgetClassification,
110};
111
/// Ordered stages of the flatten pipeline.
///
/// The explicit discriminants together with `PartialOrd`/`Ord` make the
/// `debug_assert!(_stage <= PipelineStage::X)` checks in `xfa_flatten_inner`
/// meaningful: they catch out-of-order stage transitions in debug builds.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum PipelineStage {
    Extract = 0,
    Bind = 1,
    Layout = 2,
    Render = 3,
    Embed = 4,
    Write = 5,
    Cleanup = 6,
}
136
137fn create_minimal_pdf_document() -> Document {
138 let mut doc = Document::new();
139 let pages_id = doc.add_object(Object::Dictionary(dictionary! {
140 "Type" => Object::Name(b"Pages".to_vec()),
141 "Kids" => Object::Array(vec![]),
142 "Count" => Object::Integer(0)
143 }));
144 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
145 "Type" => Object::Name(b"Catalog".to_vec()),
146 "Pages" => Object::Reference(pages_id)
147 }));
148 doc.trailer.set("Root", Object::Reference(catalog_id));
149 doc
150}
151
/// Diagnostic dump of the pagination result plus script-execution metadata,
/// returned by the `*_with_layout_dump` public entry points.
#[derive(Debug, Clone, Default)]
pub struct LayoutDump {
    /// One entry per laid-out page, in page order (renumbered after any
    /// empty-page suppression).
    pub pages: Vec<LayoutDumpEntry>,
    /// Counters describing how dynamic (FormCalc/JavaScript) scripts were
    /// handled during the run.
    pub dynamic_scripts: DynamicScriptOutcome,
    /// Overall fidelity of the produced output.
    pub output_quality: OutputQuality,
}
162
/// Per-page pagination diagnostics.
#[derive(Debug, Clone)]
pub struct LayoutDumpEntry {
    /// 1-based page number.
    pub page_num: u32,
    /// Total height available to the layout engine on this page.
    pub page_height: f64,
    /// Height actually consumed by laid-out content.
    pub used_height: f64,
    /// Whether content overflowed from this page onto the next one.
    pub overflow_to_next: bool,
    /// Identifier of the first element that overflowed, if any.
    pub first_overflow_element: Option<String>,
}
/// Lightweight, copyable summary of a flatten run, returned by the
/// `*_with_metadata` public entry points.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct FlattenMetadata {
    /// Counters describing how dynamic scripts were handled.
    pub dynamic_scripts: DynamicScriptOutcome,
    /// Overall fidelity of the output (mirrors `dynamic_scripts.output_quality`).
    pub output_quality: OutputQuality,
}
186
187impl FlattenMetadata {
188 fn from_dynamic_scripts(dynamic_scripts: DynamicScriptOutcome) -> Self {
189 Self {
190 dynamic_scripts,
191 output_quality: dynamic_scripts.output_quality,
192 }
193 }
194}
195
/// Internal bundle produced by the flatten pipeline: the finished PDF bytes
/// plus the layout dump and metadata that the public wrappers unpack.
struct FlattenOutput {
    pdf_bytes: Vec<u8>,
    layout_dump: LayoutDump,
    metadata: FlattenMetadata,
}
201
202impl FlattenOutput {
203 fn new(
204 pdf_bytes: Vec<u8>,
205 mut layout_dump: LayoutDump,
206 dynamic_scripts: DynamicScriptOutcome,
207 ) -> Self {
208 layout_dump.dynamic_scripts = dynamic_scripts;
209 layout_dump.output_quality = dynamic_scripts.output_quality;
210 Self {
211 pdf_bytes,
212 layout_dump,
213 metadata: FlattenMetadata::from_dynamic_scripts(dynamic_scripts),
214 }
215 }
216
217 fn without_dump(pdf_bytes: Vec<u8>) -> Self {
218 Self::new(
219 pdf_bytes,
220 LayoutDump::default(),
221 DynamicScriptOutcome::default(),
222 )
223 }
224}
225
226pub fn is_pdf_encrypted(pdf_bytes: &[u8]) -> bool {
228 Document::load_mem(pdf_bytes)
229 .map(|doc| doc.trailer.get(b"Encrypt").is_ok())
230 .unwrap_or(false)
231}
232
/// Outcome of attempting to transparently decrypt a PDF.
enum DecryptResult {
    /// Input was not encrypted (or could not even be parsed).
    NotEncrypted,
    /// Decryption succeeded; holds the re-saved, unencrypted PDF bytes.
    Decrypted(Vec<u8>),
    /// Encrypted with a non-empty password we do not have.
    NeedsPassword,
}
238
239fn try_decrypt_pdf(pdf_bytes: &[u8]) -> DecryptResult {
242 let mut doc = match Document::load_mem(pdf_bytes) {
243 Ok(d) => d,
244 Err(_) => return DecryptResult::NotEncrypted, };
246
247 if doc.was_encrypted() {
251 let mut buf = Vec::new();
253 match doc.save_to(&mut buf) {
254 Ok(()) => return DecryptResult::Decrypted(buf),
255 Err(_) => return DecryptResult::NeedsPassword,
256 }
257 }
258
259 if doc.trailer.get(b"Encrypt").is_ok() {
260 match Document::load_mem_with_password(pdf_bytes, "") {
262 Ok(mut decrypted_doc) => {
263 decrypted_doc.trailer.remove(b"Encrypt");
264 let mut buf = Vec::new();
265 match decrypted_doc.save_to(&mut buf) {
266 Ok(()) => return DecryptResult::Decrypted(buf),
267 Err(_) => return DecryptResult::NeedsPassword,
268 }
269 }
270 Err(_) => return DecryptResult::NeedsPassword,
271 }
272 }
273
274 DecryptResult::NotEncrypted
275}
276
/// Recursively check whether any layout node on a page is a *data-bearing*
/// field, tallying excluded bind="none" fields and non-data widgets
/// (signatures / buttons / barcodes) into the supplied `Cell` counters.
///
/// NOTE(review): `any` short-circuits on the first data field, so the two
/// counters are a lower bound over the visited subtree, not a full census.
fn page_has_fields(
    nodes: &[LayoutNode],
    tree: &FormTree,
    bind_none_count: &Cell<usize>,
    widget_count: &Cell<usize>,
) -> bool {
    use xfa_layout_engine::form::{FieldKind, FormNodeType};
    nodes.iter().any(|n| {
        let meta = tree.meta(n.form_node);
        // Is this layout node backed by a form field at all?
        let is_field = matches!(tree.get(n.form_node).node_type, FormNodeType::Field { .. });
        // Widgets that never carry user data, per their XFA field kind.
        let field_kind_is_non_data_widget = matches!(
            meta.field_kind,
            FieldKind::Signature | FieldKind::Button | FieldKind::Barcode
        );
        let widget_class =
            exclude_non_data_widgets_from_page_suppression(field_kind_is_non_data_widget);
        let is_non_data_widget =
            matches!(widget_class, WidgetClassification::ExcludedNonDataWidget);
        if is_field && is_non_data_widget {
            widget_count.set(widget_count.get() + 1);
        }
        // bind="none" fields are likewise excluded from "has data" checks.
        let classification = exclude_bind_none_fields_from_page_data_suppression(
            is_field,
            is_non_data_widget,
            meta.data_bind_none,
        );
        if matches!(classification, BindNoneClassification::ExcludedBindNone) {
            bind_none_count.set(bind_none_count.get() + 1);
        }
        let is_data_field = is_field && matches!(classification, BindNoneClassification::DataField);
        // Either this node counts, or something in its subtree does.
        is_data_field || page_has_fields(&n.children, tree, bind_none_count, widget_count)
    })
}
328
329fn page_has_field_data(nodes: &[LayoutNode], tree: &FormTree) -> bool {
334 use xfa_layout_engine::form::FormNodeType;
335 nodes.iter().any(|n| {
336 matches!(
337 &tree.get(n.form_node).node_type,
338 FormNodeType::Field { value } if !value.is_empty()
339 ) || page_has_field_data(&n.children, tree)
340 })
341}
342
343#[must_use = "flattened PDF bytes must be used; discarding them loses output"]
386pub fn flatten_xfa_to_pdf(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
387 flatten_xfa_to_pdf_internal(pdf_bytes, false).map(|out| out.pdf_bytes)
388}
389#[must_use = "flattened PDF bytes and layout dump must be used; discarding them loses output"]
391pub fn flatten_xfa_to_pdf_with_layout_dump(pdf_bytes: &[u8]) -> Result<(Vec<u8>, LayoutDump)> {
392 let out = flatten_xfa_to_pdf_internal(pdf_bytes, true)?;
393 Ok((out.pdf_bytes, out.layout_dump))
394}
395#[must_use = "flattened PDF bytes and metadata must be used; discarding them loses output"]
397pub fn flatten_xfa_to_pdf_with_metadata(pdf_bytes: &[u8]) -> Result<(Vec<u8>, FlattenMetadata)> {
398 let out = flatten_xfa_to_pdf_internal(pdf_bytes, false)?;
399 Ok((out.pdf_bytes, out.metadata))
400}
401#[must_use = "flattened PDF bytes, layout dump, and metadata must be used; discarding them loses output"]
403pub fn flatten_xfa_to_pdf_with_layout_dump_and_metadata(
404 pdf_bytes: &[u8],
405) -> Result<(Vec<u8>, LayoutDump, FlattenMetadata)> {
406 let out = flatten_xfa_to_pdf_internal(pdf_bytes, true)?;
407 Ok((out.pdf_bytes, out.layout_dump, out.metadata))
408}
409
410fn flatten_xfa_to_pdf_internal(
411 pdf_bytes: &[u8],
412 collect_layout_dump: bool,
413) -> Result<FlattenOutput> {
414 let depth = FLATTEN_DEPTH.with(|d| d.get());
423 if depth >= 1 {
424 return Err(XfaError::LayoutFailed(
425 "flatten_xfa_to_pdf called recursively — aborting to prevent stack overflow".into(),
426 ));
427 }
428 FLATTEN_DEPTH.with(|d| d.set(depth + 1));
429 struct DepthGuard;
431 impl Drop for DepthGuard {
432 fn drop(&mut self) {
433 FLATTEN_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
434 }
435 }
436 let _depth_guard = DepthGuard;
437
438 if !pdf_bytes.windows(9).any(|w| w == b"/AcroForm")
442 && !pdf_bytes.windows(7).any(|w| w == b"xdp:xdp")
443 {
444 return Ok(FlattenOutput::without_dump(pdf_bytes.to_vec()));
445 }
446
447 let decrypted;
450 let pdf_bytes = match try_decrypt_pdf(pdf_bytes) {
451 DecryptResult::NotEncrypted => pdf_bytes,
452 DecryptResult::Decrypted(bytes) => {
453 decrypted = bytes;
454 &decrypted
455 }
456 DecryptResult::NeedsPassword => {
457 return Err(XfaError::Encrypted(
458 "PDF is encrypted and requires a password".into(),
459 ));
460 }
461 };
462
463 let packets = match extract_xfa_from_bytes(pdf_bytes.to_vec()) {
465 Ok(p) => p,
466 Err(_) => {
467 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
471 }
472 };
473
474 let template_xml = match packets.template() {
475 Some(t) => strip_undefined_xml_entities(t),
476 None => {
477 trace_sites::fallback(
480 TraceReason::StaticFallbackTaken,
481 "template packet missing or unparseable",
482 );
483 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
484 }
485 };
486
487 if is_corrupt_xfa_template(pdf_bytes.len(), &template_xml) {
491 trace_sites::fallback(
492 TraceReason::StaticFallbackTaken,
493 "corrupt or minimal XFA template",
494 );
495 return static_fallback(pdf_bytes).map(FlattenOutput::without_dump);
496 }
497
498 const FLATTEN_TIMEOUT: Duration = Duration::from_secs(30);
506 let pdf_bytes_ref = pdf_bytes.to_vec();
507 let template_xml_owned = template_xml.clone();
508 let datasets_xml_owned = packets.datasets().map(strip_undefined_xml_entities);
509 let form_xml_owned = packets.get_packet("form").map(|s| s.to_string());
510
511 let handle = thread::spawn(move || {
512 xfa_flatten_inner(
513 &pdf_bytes_ref,
514 &template_xml_owned,
515 datasets_xml_owned.as_deref(),
516 form_xml_owned.as_deref(),
517 collect_layout_dump,
518 )
519 });
520
521 match handle.join() {
522 Ok(Ok(out)) => Ok(out),
523 Ok(Err(e @ XfaError::UnsupportedFeature(_))) => Err(e),
524 Ok(Err(e)) => {
525 eprintln!("XFA flatten failed: {e:?}");
526 trace_sites::fallback(
527 TraceReason::StaticFallbackTaken,
528 format!("inner pipeline error: {e:?}"),
529 );
530 static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
531 }
532 Err(_) => {
533 eprintln!("XFA flatten timed out after {:?}", FLATTEN_TIMEOUT);
534 trace_sites::fallback(TraceReason::StaticFallbackTaken, "inner pipeline timeout");
535 static_fallback(pdf_bytes).map(FlattenOutput::without_dump)
536 }
537 }
538}
539
/// Full XFA flatten pipeline: parse datasets → merge/bind template → run
/// dynamic scripts → layout → suppress empty pages → render overlays into
/// the PDF → strip form machinery → serialize.
///
/// `_stage` plus the `debug_assert!`s document and (in debug builds) enforce
/// the intended stage ordering; they have no effect in release builds.
fn xfa_flatten_inner(
    pdf_bytes: &[u8],
    template_xml: &str,
    datasets_xml: Option<&str>,
    form_xml: Option<&str>,
    collect_layout_dump: bool,
) -> Result<FlattenOutput> {
    let mut _stage = PipelineStage::Extract;

    log::debug!(
        "XFA flatten: {} bytes input, template={} bytes",
        pdf_bytes.len(),
        template_xml.len()
    );

    // Parse the datasets packet into the data DOM; absent datasets bind empty.
    let data_dom = if let Some(ds_xml) = datasets_xml {
        DataDom::from_xml(ds_xml)
            .map_err(|e| XfaError::ParseFailed(format!("datasets parse: {e}")))?
    } else {
        DataDom::new()
    };

    // Embedded images referenced by href from the template, keyed by name.
    let image_files = match Document::load_mem(pdf_bytes) {
        Ok(doc) => extract_embedded_images(&doc),
        Err(_) => HashMap::new(),
    };

    // Warn (once per run) about features we render degraded or skip.
    if template_xml.contains("barcode") {
        log::warn!("XFA barcode elements found but not supported — rendered as empty boxes");
    }
    if template_xml.contains("<signature") || template_xml.contains("<Signature") {
        log::warn!("XFA signature elements found but not supported — elements skipped");
    }
    if javascript_policy::template_mentions_javascript(template_xml) {
        log::warn!(
            "{}",
            javascript_policy::execution_denied_message(JavaScriptEntryPoint::XfaEventHook)
        );
    }

    debug_assert!(
        _stage <= PipelineStage::Bind,
        "pipeline stage order violated: expected <= Bind"
    );
    _stage = PipelineStage::Bind;

    // Merge template + data into the form tree (data binding happens here).
    let merger = FormMerger::new(&data_dom).with_image_files(image_files);
    let (mut tree, root_id) = merger
        .merge(template_xml)
        .map_err(|e| XfaError::ParseFailed(format!("template merge: {e}")))?;

    log::debug!("XFA bind: {} form nodes created", tree.nodes.len());

    let bind_reason = if tree.any_data_bound {
        TraceReason::SubformMaterialisedFromData
    } else {
        TraceReason::SubformSuppressedNoData
    };
    trace_sites::bind(
        "root",
        bind_reason,
        format!(
            "form_nodes={} any_data_bound={}",
            tree.nodes.len(),
            tree.any_data_bound
        ),
    );

    // Dynamic script execution mode is selected via env var; "sandboxed" only
    // gets a real JS runtime when the `xfa-js-sandboxed` feature is enabled.
    let dynamic_scripts = match std::env::var("XFA_JS_EXECUTION_MODE")
        .ok()
        .map(|s| s.to_ascii_lowercase())
        .as_deref()
    {
        Some("strict") => {
            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::Strict)?
        }
        Some("sandboxed") | Some("sandboxed_runtime") => {
            #[cfg(feature = "xfa-js-sandboxed")]
            {
                use crate::js_runtime::{NullRuntime, QuickJsRuntime, XfaJsRuntime};
                match QuickJsRuntime::new() {
                    Ok(mut rt) => {
                        rt.set_data_handle(&data_dom as *const _);
                        apply_dynamic_scripts_with_runtime(
                            &mut tree,
                            root_id,
                            JsExecutionMode::SandboxedRuntime,
                            &mut rt,
                        )?
                    }
                    // Runtime failed to start — fall back to the no-op runtime.
                    Err(_) => apply_dynamic_scripts_with_runtime(
                        &mut tree,
                        root_id,
                        JsExecutionMode::SandboxedRuntime,
                        &mut NullRuntime::new(),
                    )?,
                }
            }
            #[cfg(not(feature = "xfa-js-sandboxed"))]
            apply_dynamic_scripts_with_mode(&mut tree, root_id, JsExecutionMode::SandboxedRuntime)?
        }
        _ => apply_dynamic_scripts(&mut tree, root_id)?,
    };
    // Emit the script counters to both the log and stderr when output is not
    // byte-exact, so degraded fidelity is visible even without a logger.
    if dynamic_scripts.output_quality != OutputQuality::Exact {
        log::warn!(
            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={}",
            dynamic_scripts.output_quality.as_str(),
            dynamic_scripts.js_present,
            dynamic_scripts.js_skipped,
            dynamic_scripts.other_skipped,
            dynamic_scripts.formcalc_run,
            dynamic_scripts.formcalc_errors,
            dynamic_scripts.js_executed,
            dynamic_scripts.js_runtime_errors,
            dynamic_scripts.js_timeouts,
            dynamic_scripts.js_oom,
            dynamic_scripts.js_host_calls,
            dynamic_scripts.js_mutations,
            dynamic_scripts.js_instance_writes,
            dynamic_scripts.js_list_writes,
            dynamic_scripts.js_binding_errors,
            dynamic_scripts.js_resolve_failures,
            dynamic_scripts.js_data_reads,
        );
        eprintln!(
            "XFA script metadata: output_quality={} js_present={} js_skipped={} other_skipped={} formcalc_run={} formcalc_errors={} js_executed={} js_runtime_errors={} js_timeouts={} js_oom={} js_host_calls={} js_mutations={} js_instance_writes={} js_list_writes={} js_binding_errors={} js_resolve_failures={} js_data_reads={}",
            dynamic_scripts.output_quality.as_str(),
            dynamic_scripts.js_present,
            dynamic_scripts.js_skipped,
            dynamic_scripts.other_skipped,
            dynamic_scripts.formcalc_run,
            dynamic_scripts.formcalc_errors,
            dynamic_scripts.js_executed,
            dynamic_scripts.js_runtime_errors,
            dynamic_scripts.js_timeouts,
            dynamic_scripts.js_oom,
            dynamic_scripts.js_host_calls,
            dynamic_scripts.js_mutations,
            dynamic_scripts.js_instance_writes,
            dynamic_scripts.js_list_writes,
            dynamic_scripts.js_binding_errors,
            dynamic_scripts.js_resolve_failures,
            dynamic_scripts.js_data_reads,
        );
    }

    // The form DOM packet records which subforms Adobe materialised; apply it
    // as presence overrides on the merged tree.
    if let Some(fxml) = form_xml {
        apply_form_dom_presence(&mut tree, root_id, fxml);
    }

    let resolved_fonts = resolve_template_fonts(template_xml, pdf_bytes);
    inject_resolved_metrics(&mut tree, &resolved_fonts);

    debug_assert!(
        _stage <= PipelineStage::Layout,
        "pipeline stage order violated: expected <= Layout"
    );
    _stage = PipelineStage::Layout;

    // Run layout, with or without the profiling dump.
    let (mut layout, mut layout_dump) = {
        let engine = LayoutEngine::new(&tree);
        if collect_layout_dump {
            let (layout, profile) = engine
                .layout_with_profile(root_id)
                .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
            (layout, Some(layout_dump_from_profile(profile)))
        } else {
            let layout = engine
                .layout(root_id)
                .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))?;
            (layout, None)
        }
    };

    if layout.pages.is_empty() {
        return Err(XfaError::LayoutFailed("layout produced 0 pages".into()));
    }

    log::debug!("XFA layout: {} pages produced", layout.pages.len());

    let form_dom_pages = form_xml.and_then(form_dom_page_count).unwrap_or(0);
    trace_sites::paginate(
        "root",
        TraceReason::PaginateFitsCurrentPage,
        layout.pages.len() as f64,
        form_dom_pages as f64,
    );

    // If the form DOM declared more pages than we produced, the presence
    // overrides may have over-suppressed: re-merge without them and keep the
    // re-run only if it exactly matches the declared page count.
    if let Some(fdp_count) = form_xml.and_then(form_dom_page_count) {
        if layout.pages.len() < fdp_count {
            log::debug!(
                "XFA layout: form_dom declared {} pages but layout produced {} — \
                 re-running without form-dom presence overrides",
                fdp_count,
                layout.pages.len(),
            );
            let image_files2 = match lopdf::Document::load_mem(pdf_bytes) {
                Ok(doc) => extract_embedded_images(&doc),
                Err(_) => HashMap::new(),
            };
            let merge_result2 = FormMerger::new(&data_dom)
                .with_image_files(image_files2)
                .merge(template_xml)
                .map_err(|e| XfaError::ParseFailed(format!("template re-merge: {e}")));
            if let Ok((mut tree2, root_id2)) = merge_result2 {
                inject_resolved_metrics(&mut tree2, &resolved_fonts);
                let layout2_result = {
                    let engine2 = LayoutEngine::new(&tree2);
                    if collect_layout_dump {
                        engine2
                            .layout_with_profile(root_id2)
                            .map(|(l, p)| (l, Some(layout_dump_from_profile(p))))
                            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))
                    } else {
                        engine2
                            .layout(root_id2)
                            .map(|l| (l, None))
                            .map_err(|e| XfaError::LayoutFailed(format!("{e:?}")))
                    }
                };
                if let Ok((layout2, layout_dump2)) = layout2_result {
                    if layout2.pages.len() > layout.pages.len() && layout2.pages.len() == fdp_count
                    {
                        tree = tree2;
                        layout = layout2;
                        layout_dump = layout_dump2;
                        log::debug!(
                            "XFA layout: re-run produced {} pages — using template-only layout",
                            layout.pages.len()
                        );
                    }
                }
            }
        }
    }

    // Drop pages whose fields are all data-empty, but only when real data was
    // bound, and never more pages than the form DOM cap allows.
    let preflight =
        suppress_empty_pages_only_when_real_data_bound(layout.pages.len(), tree.any_data_bound);
    if preflight.run_suppression {
        let cap_decision =
            cap_suppression_by_form_dom(layout.pages.len(), form_xml.and_then(form_dom_page_count));
        let max_suppress = cap_decision.max_suppress;

        let bind_none_count: Cell<usize> = Cell::new(0);
        let widget_count: Cell<usize> = Cell::new(0);

        let mut suppressed = 0usize;
        let keep: Vec<bool> = layout
            .pages
            .iter()
            .enumerate()
            .map(|(page_index, p)| {
                // Drop only pages that *have* data fields, all of them empty.
                if page_has_fields(&p.nodes, &tree, &bind_none_count, &widget_count)
                    && !page_has_field_data(&p.nodes, &tree)
                    && suppressed < max_suppress
                {
                    suppressed += 1;
                    trace_sites::suppress(
                        TraceReason::SuppressEmptyDataPageDropped,
                        page_index as u32,
                        "data_empty_page_dropped",
                    );
                    false
                } else {
                    true
                }
            })
            .collect();

        emit_bind_none_summary(bind_none_count.get());
        emit_non_data_widget_summary(widget_count.get());
        // Never suppress everything: keep the original pages if nothing survives.
        let any_keep = keep.iter().any(|&k| k);
        if any_keep {
            let mut idx = 0;
            layout.pages.retain(|_| {
                let k = keep[idx];
                idx += 1;
                k
            });
            if let Some(ref mut dump) = layout_dump {
                let mut idx = 0;
                dump.pages.retain(|_| {
                    let k = keep[idx];
                    idx += 1;
                    k
                });
            }
        }
    }

    if let Some(ref mut dump) = layout_dump {
        renumber_layout_dump_pages(dump);
    }

    debug_assert!(
        _stage <= PipelineStage::Render,
        "pipeline stage order violated: expected <= Render"
    );
    _stage = PipelineStage::Render;

    let mut doc = match Document::load_mem(pdf_bytes) {
        Ok(d) => d,
        Err(_) => {
            eprintln!("lopdf load failed, creating minimal PDF structure for XFA layout");
            create_minimal_pdf_document()
        }
    };

    debug_assert!(
        _stage <= PipelineStage::Embed,
        "pipeline stage order violated: expected <= Embed"
    );
    _stage = PipelineStage::Embed;

    let (font_map, embedded_font_objects, metrics_data) =
        embed_resolved_fonts(&mut doc, &resolved_fonts, &layout);

    let config = XfaRenderConfig {
        font_map,
        font_metrics_data: metrics_data,
        ..Default::default()
    };

    let overlays = generate_all_overlays(&layout, &config)
        .map_err(|e| XfaError::LayoutFailed(format!("overlay generation: {e:?}")))?;

    log::debug!(
        "XFA render: {} content streams generated ({} bytes total)",
        overlays.len(),
        overlays
            .iter()
            .map(|o| o.content_stream.len())
            .sum::<usize>()
    );

    // Fallback base-14 fonts available to every overlay content stream.
    let font_ids: [ObjectId; 3] = [
        doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Font".to_vec()),
            "Subtype" => Object::Name(b"Type1".to_vec()),
            "BaseFont" => Object::Name(b"Times-Roman".to_vec()),
            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
        })),
        doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Font".to_vec()),
            "Subtype" => Object::Name(b"Type1".to_vec()),
            "BaseFont" => Object::Name(b"Helvetica".to_vec()),
            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
        })),
        doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Font".to_vec()),
            "Subtype" => Object::Name(b"Type1".to_vec()),
            "BaseFont" => Object::Name(b"Courier".to_vec()),
            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec())
        })),
    ];

    let existing_page_ids: Vec<ObjectId> = doc.page_iter().collect();
    let n_layout = overlays.len();
    let n_existing = existing_page_ids.len();

    // Decide whether to keep the PDF's static page content (XFAF/static
    // forms) or replace pages wholesale with our rendered overlays.
    let is_static_form = template_xml.contains("baseProfile=\"interactiveForms\"");
    let has_static_content = pages_have_static_content(&doc);

    let overlay_is_substantial = overlays.iter().any(|o| o.content_stream.len() > 1000);
    let preserve_static = is_static_form
        || n_layout < n_existing
        || (n_layout <= n_existing && has_static_content && overlay_is_substantial);

    debug_assert!(
        _stage <= PipelineStage::Write,
        "pipeline stage order violated: expected <= Write"
    );
    _stage = PipelineStage::Write;

    if preserve_static {
        // Keep the original page content; bake widget appearances, and if
        // none could be baked, overlay just the field values instead.
        let baked = flatten_widget_appearances(&mut doc);
        if baked == 0 {
            if let Ok(fv_overlays) = generate_field_values_overlays(&layout, &config) {
                for (i, overlay) in fv_overlays.iter().enumerate() {
                    if i < n_existing && !overlay.content_stream.is_empty() {
                        let _ = overlay_page_content(
                            &mut doc,
                            existing_page_ids[i],
                            overlay,
                            &font_ids,
                            &embedded_font_objects,
                        );
                    }
                }
            }
        }
    } else {
        // Replace existing pages with rendered content; append extra pages
        // when layout produced more pages than the document had.
        for (i, overlay) in overlays.iter().enumerate() {
            if i < n_existing {
                let lp = &layout.pages[i];
                write_page_content(
                    &mut doc,
                    existing_page_ids[i],
                    overlay,
                    &font_ids,
                    &embedded_font_objects,
                    Some(lp.width),
                    Some(lp.height),
                )?;
            } else {
                let lp = &layout.pages[i];
                add_new_page(
                    &mut doc,
                    lp.width,
                    lp.height,
                    overlay,
                    &font_ids,
                    &embedded_font_objects,
                )?;
            }
        }

        for &page_id in &existing_page_ids[..n_existing.min(n_layout)] {
            bake_checkbox_radio_ap_marks(&mut doc, page_id);
        }
    }

    // Trim surplus trailing pages, but for static XFAF only when the
    // form-DOM guard says the template has no dynamic logic that could
    // legitimately shrink the page count.
    let template_has_dynamic_logic = template_xml.contains("<script")
        || template_xml.contains(r#"contentType="application/x-formcalc""#);
    let trim_decision = static_xfaf_excess_page_trim_with_form_dom_guard(
        is_static_form,
        template_has_dynamic_logic,
        n_layout,
        form_xml.and_then(form_dom_page_count),
    );
    let static_can_trim = trim_decision.allow_trim;
    if n_layout < n_existing && (!preserve_static || static_can_trim) {
        // delete_pages takes 1-based page numbers; delete from the back.
        let excess: Vec<u32> = ((n_layout + 1) as u32..=(n_existing as u32))
            .rev()
            .collect();
        doc.delete_pages(&excess);
    }

    if is_static_form {
        for &page_id in &existing_page_ids {
            strip_widget_annotations(&mut doc, page_id);
        }
    } else {
        // Rendered pages fully replace widgets — drop their annotation arrays.
        for &page_id in existing_page_ids.iter().take(n_layout.min(n_existing)) {
            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
                dict.remove(b"Annots");
            }
        }
    }

    debug_assert!(
        _stage <= PipelineStage::Cleanup,
        "pipeline stage order violated: expected <= Cleanup"
    );
    #[allow(unused_assignments)]
    {
        _stage = PipelineStage::Cleanup;
    }

    remove_acroform(&mut doc);
    let stripped_js = javascript_policy::strip_javascript_for_flatten(&mut doc);
    if stripped_js > 0 {
        log::warn!("stripped {stripped_js} JavaScript action(s) from flattened output");
    }

    let mut out = Vec::new();
    doc.save_to(&mut out)
        .map_err(|e| XfaError::LayoutFailed(format!("save: {e}")))?;
    Ok(FlattenOutput::new(
        out,
        layout_dump.unwrap_or_default(),
        dynamic_scripts,
    ))
}
1235
1236fn layout_dump_from_profile(profile: LayoutProfile) -> LayoutDump {
1237 LayoutDump {
1238 pages: profile
1239 .pages
1240 .into_iter()
1241 .enumerate()
1242 .map(|(idx, page)| LayoutDumpEntry {
1243 page_num: idx as u32 + 1,
1244 page_height: page.page_height,
1245 used_height: page.used_height,
1246 overflow_to_next: page.overflow_to_next,
1247 first_overflow_element: page.first_overflow_element,
1248 })
1249 .collect(),
1250 ..Default::default()
1251 }
1252}
1253
1254fn renumber_layout_dump_pages(dump: &mut LayoutDump) {
1255 for (idx, page) in dump.pages.iter_mut().enumerate() {
1256 page.page_num = idx as u32 + 1;
1257 }
1258}
1259
/// Extract embedded image files from the catalog's name tree, keyed by their
/// entry name.
///
/// Looks under `/Names/XFAImages` first, then `/Names/EmbeddedFiles`. Each
/// name-tree value may be either a file specification (`/EF /F` → stream)
/// or a bare stream reference; both forms are handled.
fn extract_embedded_images(doc: &Document) -> HashMap<String, Vec<u8>> {
    let mut images = HashMap::new();

    // Resolve an object that is either an inline dictionary or a reference
    // to one.
    fn deref_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
        match obj {
            Object::Reference(id) => doc.get_dictionary(*id).ok(),
            Object::Dictionary(d) => Some(d),
            _ => None,
        }
    }

    // Resolve a (possibly referenced) stream object and return its
    // decompressed content. Decompression failure falls back to raw bytes.
    fn extract_stream(doc: &Document, obj: &Object) -> Option<Vec<u8>> {
        let stream_obj = match obj {
            Object::Reference(id) => doc.get_object(*id).ok()?,
            other => other,
        };
        if let Object::Stream(ref stream) = *stream_obj {
            let mut s = stream.clone();
            let _ = s.decompress();
            Some(s.content.clone())
        } else {
            None
        }
    }

    let catalog = match doc.catalog() {
        Ok(c) => c,
        Err(_) => return images,
    };
    let names_obj = match catalog.get(b"Names") {
        Ok(obj) => obj,
        Err(_) => {
            eprintln!("[img-href] no /Names in catalog");
            return images;
        }
    };
    let names_dict = match deref_dict(doc, names_obj) {
        Some(d) => d,
        None => return images,
    };
    // Prefer the XFA-specific image tree; fall back to embedded files.
    let ef_obj = match names_dict
        .get(b"XFAImages")
        .or_else(|_| names_dict.get(b"EmbeddedFiles"))
    {
        Ok(obj) => obj,
        Err(_) => return images,
    };
    let ef_dict = match deref_dict(doc, ef_obj) {
        Some(d) => d,
        None => return images,
    };

    let names_arr_obj = match ef_dict.get(b"Names") {
        Ok(obj) => obj,
        Err(_) => return images,
    };
    let names_array = match names_arr_obj {
        Object::Array(arr) => arr,
        Object::Reference(id) => match doc.get_object(*id) {
            Ok(Object::Array(arr)) => arr,
            _ => return images,
        },
        _ => return images,
    };

    // Name trees store flat [name, value, name, value, …] pairs.
    let mut i = 0;
    while i + 1 < names_array.len() {
        let name = match &names_array[i] {
            Object::String(bytes, _) => String::from_utf8_lossy(bytes).to_string(),
            _ => {
                i += 2;
                continue;
            }
        };

        let value_ref = &names_array[i + 1];

        // Form 1: a file-spec dictionary — follow /EF /F to the data stream.
        if let Some(filespec) = deref_dict(doc, value_ref) {
            if let Ok(ef_obj) = filespec.get(b"EF") {
                if let Some(ef) = deref_dict(doc, ef_obj) {
                    if let Ok(f_ref) = ef.get(b"F") {
                        if let Some(data) = extract_stream(doc, f_ref) {
                            images.insert(name.clone(), data);
                            i += 2;
                            continue;
                        }
                    }
                }
            }
        }

        // Form 2: the value is itself a stream.
        if let Some(data) = extract_stream(doc, value_ref) {
            images.insert(name.clone(), data);
        }

        i += 2;
    }
    images
}
1380
1381#[doc(hidden)]
1386pub fn extract_embedded_fonts(doc: &Document) -> Vec<EmbeddedFontData> {
1387 let mut fonts = Vec::new();
1388 let mut seen = std::collections::HashSet::new();
1389 for (&font_object_id, obj) in &doc.objects {
1390 let dict = match obj.as_dict() {
1391 Ok(d) => d,
1392 Err(_) => continue,
1393 };
1394 let is_font =
1395 dict.get(b"Type").ok().and_then(|o| o.as_name().ok()) == Some(b"Font".as_slice());
1396 if !is_font {
1397 continue;
1398 }
1399 let base_font = match dict.get(b"BaseFont").ok().and_then(|o| o.as_name().ok()) {
1400 Some(n) => String::from_utf8_lossy(n).to_string(),
1401 None => continue,
1402 };
1403
1404 let pdf_widths = extract_font_widths(dict);
1405 let pdf_encoding = extract_font_encoding(doc, dict);
1406 let pdf_source_font =
1407 extract_simple_pdf_source_font(doc, font_object_id, dict, pdf_widths.as_ref());
1408
1409 if let Some((stream_id, data)) = extract_font_from_direct_fd(doc, dict, &base_font) {
1411 if seen.insert(stream_id) {
1412 store_font_data(
1413 &mut fonts,
1414 &base_font,
1415 data,
1416 pdf_widths.clone(),
1417 pdf_encoding.clone(),
1418 pdf_source_font,
1419 );
1420 }
1421 continue;
1422 }
1423
1424 if let Some((stream_id, data)) = extract_cidfont_data(doc, dict, &base_font, &seen) {
1428 if seen.insert(stream_id) {
1429 let cid_widths = extract_cid_font_widths(doc, dict);
1430 store_font_data(&mut fonts, &base_font, data, cid_widths, None, None);
1431 }
1432 continue;
1433 }
1434
1435 if let Some(source_font) = pdf_source_font {
1436 store_font_data(
1442 &mut fonts,
1443 &base_font,
1444 Vec::new(),
1445 pdf_widths.clone(),
1446 pdf_encoding.clone(),
1447 Some(source_font),
1448 );
1449 }
1450 }
1451 fonts
1452}
1453
1454fn extract_font_widths(dict: &lopdf::Dictionary) -> Option<(u16, Vec<u16>)> {
1456 let first_char = dict.get(b"FirstChar").ok()?.as_i64().ok()? as u16;
1457 let _last_char = dict.get(b"LastChar").ok()?.as_i64().ok()? as u16;
1458 let widths_array = dict.get(b"Widths").ok()?.as_array().ok()?;
1459 let widths: Vec<u16> = widths_array
1460 .iter()
1461 .filter_map(|w| w.as_i64().ok().map(|v| v as u16))
1462 .collect();
1463 if widths.is_empty() {
1464 return None;
1465 }
1466 Some((first_char, widths))
1467}
1468
/// Decode a Type0 font's CID widths from its descendant font's `/W` array.
///
/// The `/W` array mixes two record forms (PDF 32000-1 §9.7.4.3):
///   `c [w1 w2 …]`  — widths for consecutive CIDs starting at `c`;
///   `c_first c_last w` — one width for the whole CID range.
/// Returns `(min_cid, widths)` where `widths` is a dense table covering
/// `min_cid..=max_cid`, gaps filled with `/DW` (default 1000).
fn extract_cid_font_widths(
    doc: &Document,
    type0_dict: &lopdf::Dictionary,
) -> Option<(u16, Vec<u16>)> {
    // A Type0 font has exactly one descendant CIDFont; take the first entry.
    let descendants = type0_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
    let desc_ref = descendants.first()?;
    let cid_dict = match desc_ref {
        Object::Reference(id) => doc.get_dictionary(*id).ok()?,
        Object::Dictionary(d) => d,
        _ => return None,
    };

    // /DW: default width for CIDs not covered by /W (spec default 1000).
    let default_width = cid_dict
        .get(b"DW")
        .ok()
        .and_then(|o| o.as_i64().ok())
        .unwrap_or(1000) as u16;

    let w_array = cid_dict.get(b"W").ok()?;
    let w_array = match resolve_object(doc, w_array) {
        Some(obj) => obj.as_array().ok()?,
        None => return None,
    };

    if w_array.is_empty() {
        return None;
    }

    // Collect (cid, width) pairs; malformed entries are skipped rather than
    // aborting the whole parse.
    let mut entries: Vec<(u16, u16)> = Vec::new();
    let mut i = 0;
    while i < w_array.len() {
        let cid_start = match w_array[i].as_i64() {
            Ok(v) => v as u16,
            Err(_) => {
                i += 1;
                continue;
            }
        };
        i += 1;
        if i >= w_array.len() {
            break;
        }

        if let Ok(widths_arr) = w_array[i].as_array() {
            // Form `c [w1 w2 …]`.
            // NOTE(review): `cid_start + j as u16` can overflow on hostile
            // input (cid_start near u16::MAX with a long array) — confirm
            // whether saturating/checked add is wanted here.
            for (j, w_obj) in widths_arr.iter().enumerate() {
                if let Ok(w) = w_obj.as_i64() {
                    entries.push((cid_start + j as u16, w as u16));
                }
            }
            i += 1;
        } else if let Ok(cid_last) = w_array[i].as_i64() {
            // Form `c_first c_last w`.
            i += 1;
            if i >= w_array.len() {
                break;
            }
            if let Ok(width) = w_array[i].as_i64() {
                let cid_last = cid_last as u16;
                for cid in cid_start..=cid_last {
                    entries.push((cid, width as u16));
                }
            }
            i += 1;
        } else {
            i += 1;
        }
    }

    if entries.is_empty() {
        return None;
    }

    // Densify into a contiguous table over [min_cid, max_cid], defaulting
    // uncovered CIDs to /DW.
    let min_cid = entries.iter().map(|(c, _)| *c).min().unwrap();
    let max_cid = entries.iter().map(|(c, _)| *c).max().unwrap();
    let len = (max_cid - min_cid + 1) as usize;
    let mut widths = vec![default_width; len];
    for (cid, w) in &entries {
        widths[(*cid - min_cid) as usize] = *w;
    }

    Some((min_cid, widths))
}
1570
/// Read a simple font's `/Encoding` dictionary and its `/Differences` array
/// into a `PdfSimpleEncoding` (base encoding + per-code Unicode overrides).
///
/// Returns `None` when the encoding is a bare name, has no `/Differences`,
/// or no difference entry maps to a known glyph name. Note: a dangling
/// reference inside `/Differences` aborts the whole extraction (the `?` on
/// `resolve_object`), not just that entry.
fn extract_font_encoding(doc: &Document, dict: &lopdf::Dictionary) -> Option<PdfSimpleEncoding> {
    let encoding_obj = resolve_object(doc, dict.get(b"Encoding").ok()?)?;
    let encoding_dict = encoding_obj.as_dict().ok()?;
    let differences_array = resolve_object(doc, encoding_dict.get(b"Differences").ok()?)?
        .as_array()
        .ok()?;

    // Missing/unknown /BaseEncoding falls back to WinAnsi.
    let base_encoding = encoding_dict
        .get(b"BaseEncoding")
        .ok()
        .and_then(|obj| resolve_object(doc, obj))
        .and_then(|obj| obj.as_name().ok())
        .and_then(PdfBaseEncoding::from_pdf_name)
        .unwrap_or(PdfBaseEncoding::WinAnsi);

    // /Differences format: an integer sets the current code, each following
    // name assigns that code and auto-increments it.
    let mut differences = Vec::new();
    let mut current_code: Option<u8> = None;
    for item in differences_array {
        let item = resolve_object(doc, item)?;
        if let Ok(code) = item.as_i64() {
            // Codes outside 0..=255 leave the cursor unset, so subsequent
            // names are skipped until the next valid code.
            current_code = u8::try_from(code).ok();
            continue;
        }

        let Some(name) = item.as_name().ok() else {
            continue;
        };
        let Some(code) = current_code else {
            continue;
        };
        let Some(glyph_name) = std::str::from_utf8(name).ok() else {
            continue;
        };
        if let Some(unicode) = pdf_glyph_name_to_unicode(glyph_name) {
            differences.push((code, unicode));
        }
        // checked_add: the cursor saturates to None at 255 rather than wrapping.
        current_code = code.checked_add(1);
    }

    if differences.is_empty() {
        return None;
    }

    Some(PdfSimpleEncoding {
        base_encoding,
        differences,
    })
}
1630
1631fn extract_simple_pdf_source_font(
1632 doc: &Document,
1633 font_object_id: ObjectId,
1634 dict: &lopdf::Dictionary,
1635 pdf_widths: Option<&(u16, Vec<u16>)>,
1636) -> Option<PdfSourceFont> {
1637 pdf_widths?;
1638
1639 let subtype = dict.get(b"Subtype").ok().and_then(|obj| obj.as_name().ok());
1640 if subtype == Some(b"Type0".as_slice()) {
1641 return None;
1642 }
1643
1644 let encoding_obj = dict
1653 .get(b"Encoding")
1654 .ok()
1655 .and_then(|obj| resolve_object(doc, obj));
1656 match encoding_obj {
1657 Some(obj) if obj.as_name().ok() == Some(b"WinAnsiEncoding".as_slice()) => {}
1658 Some(obj) => {
1659 let base = obj
1660 .as_dict()
1661 .ok()
1662 .and_then(|enc| enc.get(b"BaseEncoding").ok())
1663 .and_then(|base| resolve_object(doc, base))
1664 .and_then(|base| base.as_name().ok());
1665 if base != Some(b"WinAnsiEncoding".as_slice()) {
1666 return None;
1667 }
1668 if obj
1669 .as_dict()
1670 .ok()
1671 .and_then(|enc| enc.get(b"Differences").ok())
1672 .is_some()
1673 {
1674 return None;
1675 }
1676 }
1677 None => return None,
1678 }
1679
1680 Some(PdfSourceFont {
1681 object_id: font_object_id,
1682 })
1683}
1684
1685fn resolve_object<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
1686 match obj {
1687 Object::Reference(id) => doc.get_object(*id).ok(),
1688 other => Some(other),
1689 }
1690}
1691
1692fn extract_font_from_direct_fd(
1694 doc: &Document,
1695 font_dict: &lopdf::Dictionary,
1696 _base_font: &str,
1697) -> Option<(lopdf::ObjectId, Vec<u8>)> {
1698 let fd_id = font_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
1699 let fd = doc.get_dictionary(fd_id).ok()?;
1700
1701 let font_stream_id = fd
1702 .get(b"FontFile2")
1703 .or_else(|_| fd.get(b"FontFile3"))
1704 .or_else(|_| fd.get(b"FontFile"))
1705 .ok()?
1706 .as_reference()
1707 .ok()?;
1708
1709 let stream = doc
1710 .get_object(font_stream_id)
1711 .and_then(|o| o.as_stream())
1712 .ok()?;
1713
1714 let data = stream
1715 .get_plain_content()
1716 .unwrap_or_else(|_| stream.content.clone());
1717
1718 if data.is_empty() {
1719 return None;
1720 }
1721
1722 Some((font_stream_id, data))
1723}
1724
1725fn extract_cidfont_data(
1730 doc: &Document,
1731 font_dict: &lopdf::Dictionary,
1732 _base_font: &str,
1733 seen: &std::collections::HashSet<lopdf::ObjectId>,
1734) -> Option<(lopdf::ObjectId, Vec<u8>)> {
1735 let descendants = font_dict.get(b"DescendantFonts").ok()?.as_array().ok()?;
1737
1738 for desc_ref in descendants {
1740 let desc_id = desc_ref.as_reference().ok()?;
1741 let desc_dict = doc.get_dictionary(desc_id).ok()?;
1742
1743 let fd_id = desc_dict.get(b"FontDescriptor").ok()?.as_reference().ok()?;
1745 let fd = doc.get_dictionary(fd_id).ok()?;
1746
1747 let font_stream_id = fd
1749 .get(b"FontFile3")
1750 .or_else(|_| fd.get(b"FontFile2"))
1751 .or_else(|_| fd.get(b"FontFile"))
1752 .ok()?
1753 .as_reference()
1754 .ok()?;
1755
1756 if seen.contains(&font_stream_id) {
1757 continue;
1758 }
1759
1760 let stream = doc
1761 .get_object(font_stream_id)
1762 .and_then(|o| o.as_stream())
1763 .ok()?;
1764
1765 let data = stream
1766 .get_plain_content()
1767 .unwrap_or_else(|_| stream.content.clone());
1768
1769 if !data.is_empty() {
1770 return Some((font_stream_id, data));
1771 }
1772 }
1773 None
1774}
1775
1776fn store_font_data(
1778 fonts: &mut Vec<EmbeddedFontData>,
1779 base_font: &str,
1780 data: Vec<u8>,
1781 pdf_widths: Option<(u16, Vec<u16>)>,
1782 pdf_encoding: Option<PdfSimpleEncoding>,
1783 pdf_source_font: Option<PdfSourceFont>,
1784) {
1785 let clean_name = if let Some(pos) = base_font.find('+') {
1786 base_font[pos + 1..].to_string()
1787 } else {
1788 base_font.to_string()
1789 };
1790 let allow_family_alias = family_alias_is_regular_face(&clean_name, &data);
1791
1792 fonts.push(EmbeddedFontData {
1794 name: clean_name.clone(),
1795 data: data.clone(),
1796 pdf_widths: pdf_widths.clone(),
1797 pdf_encoding: pdf_encoding.clone(),
1798 pdf_source_font,
1799 });
1800
1801 if let Ok(face) = ttf_parser::Face::parse(&data, 0) {
1805 for name_record in face.names() {
1806 let allow_alias = match name_record.name_id {
1807 ttf_parser::name_id::FAMILY => allow_family_alias,
1808 ttf_parser::name_id::FULL_NAME | ttf_parser::name_id::POST_SCRIPT_NAME => true,
1809 _ => false,
1810 };
1811 if !allow_alias {
1812 continue;
1813 }
1814 if let Some(alias) = name_record.to_string() {
1815 if alias != clean_name {
1816 fonts.push(EmbeddedFontData {
1817 name: alias,
1818 data: data.clone(),
1819 pdf_widths: pdf_widths.clone(),
1820 pdf_encoding: pdf_encoding.clone(),
1821 pdf_source_font,
1822 });
1823 }
1824 }
1825 }
1826 }
1827
1828 let normalized = ps_name_to_family(&clean_name);
1832 if allow_family_alias && normalized != clean_name {
1833 fonts.push(EmbeddedFontData {
1834 name: normalized,
1835 data,
1836 pdf_widths,
1837 pdf_encoding,
1838 pdf_source_font,
1839 });
1840 }
1841}
1842
1843fn family_alias_is_regular_face(clean_name: &str, data: &[u8]) -> bool {
1844 if let Ok(face) = ttf_parser::Face::parse(data, 0) {
1845 if face.is_bold() || face.is_italic() {
1846 return false;
1847 }
1848 }
1849
1850 let lower = clean_name.to_ascii_lowercase();
1851 !lower.contains("bold") && !lower.contains("italic") && !lower.contains("oblique")
1852}
1853
/// Convert a PostScript font name to a display family name: strip common
/// foundry/style suffixes, then insert a space at every lowercase→uppercase
/// boundary ("TimesNewRomanPSMT" -> "Times New Roman").
///
/// Fix: the original indexed `base.as_bytes()[i - 1]` with a *char* index
/// from `enumerate()`, which reads the wrong byte (and misclassifies the
/// previous character) for any name containing non-ASCII characters. The
/// previous character is now tracked directly.
fn ps_name_to_family(ps_name: &str) -> String {
    // Suffix list is ordered longest/most-specific first where prefixes overlap.
    let base = ps_name
        .strip_suffix("PSMT")
        .or_else(|| ps_name.strip_suffix("PS-BoldItalicMT"))
        .or_else(|| ps_name.strip_suffix("PS-BoldMT"))
        .or_else(|| ps_name.strip_suffix("PS-ItalicMT"))
        .or_else(|| ps_name.strip_suffix("-BoldItalicMT"))
        .or_else(|| ps_name.strip_suffix("-BoldMT"))
        .or_else(|| ps_name.strip_suffix("-ItalicMT"))
        .or_else(|| ps_name.strip_suffix("MT"))
        .or_else(|| ps_name.strip_suffix("-Regular"))
        .or_else(|| ps_name.strip_suffix("-Bold"))
        .or_else(|| ps_name.strip_suffix("-Italic"))
        .or_else(|| ps_name.strip_suffix("-BoldItalic"))
        .unwrap_or(ps_name);

    let mut result = String::with_capacity(base.len() + 4);
    let mut prev: Option<char> = None;
    for ch in base.chars() {
        // Space precedes an uppercase letter that follows a lowercase one.
        if ch.is_uppercase() && prev.is_some_and(|p| p.is_lowercase()) {
            result.push(' ');
        }
        result.push(ch);
        prev = Some(ch);
    }
    result
}
1888
/// A font request found in the XFA template (`<font typeface="..."/>`),
/// used to drive font resolution before rendering.
struct TemplateFontEntry {
    // Family name from the `typeface` attribute.
    typeface: String,
    // Optional `weight` attribute (e.g. "bold").
    weight: Option<String>,
    // Optional `posture` attribute (e.g. "italic").
    posture: Option<String>,
    // Optional `genericFamily` fallback hint.
    generic_family: Option<String>,
}
1896
1897fn collect_template_font_entries(template_xml: &str) -> Vec<TemplateFontEntry> {
1898 let mut entries = Vec::new();
1899 let mut seen = std::collections::HashSet::new();
1900 if let Ok(xml_doc) = roxmltree::Document::parse(template_xml) {
1901 for node in xml_doc.descendants() {
1902 if node.tag_name().name() == "font" {
1903 if let Some(typeface) = node.attribute("typeface") {
1904 let name = typeface.to_string();
1905 let weight = node.attribute("weight").map(|s| s.to_string());
1906 let posture = node.attribute("posture").map(|s| s.to_string());
1907 let generic_family = node.attribute("genericFamily").map(|s| s.to_string());
1908 let key = font_variant_key(&name, weight.as_deref(), posture.as_deref());
1909 if !name.is_empty() && seen.insert(key.to_lowercase()) {
1910 entries.push(TemplateFontEntry {
1911 typeface: name,
1912 weight,
1913 posture,
1914 generic_family,
1915 });
1916 }
1917 }
1918 }
1919 }
1920 }
1921 entries
1922}
1923
/// Embed `font` into `doc` as a CIDFontType2 Type0 font (Identity-H) and
/// return the object id of the top-level `/Font` dictionary.
///
/// Builds the full object chain: font-program stream -> FontDescriptor ->
/// CIDFont -> Type0 font, plus a ToUnicode CMap so extracted/copied text
/// maps back to Unicode.
fn embed_font_in_pdf(doc: &mut Document, font: &ResolvedFont) -> ObjectId {
    let font_stream = Stream::new(
        dictionary! {
            "Length" => Object::Integer(font.data.len() as i64),
            "Length1" => Object::Integer(font.data.len() as i64)
        },
        font.data.clone(),
    );
    let font_file_id = doc.add_object(Object::Stream(font_stream));

    // Scale font-unit metrics into PDF's 1000-per-em glyph space.
    let upem = font.units_per_em as f64;
    let scale = 1000.0 / upem.max(1.0);
    let ascent = (font.ascender as f64 * scale) as i64;
    let descent = (font.descender as f64 * scale) as i64;
    // CapHeight approximated as 70% of ascent (no OS/2 table lookup here).
    let cap_height = (ascent as f64 * 0.7) as i64;
    // PDF names cannot contain spaces.
    let base_name = font.name.replace(' ', "-");

    // Flags=32 (nonsymbolic) and StemV=80 are conventional placeholder values.
    let fd = dictionary! {
        "Type" => Object::Name(b"FontDescriptor".to_vec()),
        "FontName" => Object::Name(base_name.as_bytes().to_vec()),
        "Flags" => Object::Integer(32),
        "FontBBox" => Object::Array(vec![
            Object::Integer(0),
            Object::Integer(descent),
            Object::Integer(1000),
            Object::Integer(ascent),
        ]),
        "ItalicAngle" => Object::Integer(0),
        "Ascent" => Object::Integer(ascent),
        "Descent" => Object::Integer(descent),
        "CapHeight" => Object::Integer(cap_height),
        "StemV" => Object::Integer(80),
        "FontFile2" => Object::Reference(font_file_id)
    };
    let fd_id = doc.add_object(Object::Dictionary(fd));

    // Fallback metrics (single 500-wide glyph) when the font exposes no CID info.
    let cid_info = font.cid_font_info().unwrap_or(CidFontInfo {
        widths: vec![500],
        gid_to_unicode: vec![],
    });

    // /W form "0 [w0 w1 ...]": widths for consecutive CIDs starting at 0.
    let widths_inner: Vec<Object> = cid_info
        .widths
        .iter()
        .map(|&w| Object::Integer(w as i64))
        .collect();
    let w_array = vec![Object::Integer(0), Object::Array(widths_inner)];

    // CIDToGIDMap=Identity: CIDs are glyph ids directly.
    let cid_font = dictionary! {
        "Type" => Object::Name(b"Font".to_vec()),
        "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
        "CIDSystemInfo" => Object::Dictionary(dictionary! {
            "Registry" => Object::String(b"Adobe".to_vec(), StringFormat::Literal),
            "Ordering" => Object::String(b"Identity".to_vec(), StringFormat::Literal),
            "Supplement" => Object::Integer(0)
        }),
        "FontDescriptor" => Object::Reference(fd_id),
        "W" => Object::Array(w_array),
        "CIDToGIDMap" => Object::Name(b"Identity".to_vec())
    };
    let cid_font_id = doc.add_object(Object::Dictionary(cid_font));

    let tounicode_data = generate_tounicode_cmap(&cid_info.gid_to_unicode);
    let tounicode_stream = Stream::new(
        dictionary! { "Length" => Object::Integer(tounicode_data.len() as i64) },
        tounicode_data,
    );
    let tounicode_id = doc.add_object(Object::Stream(tounicode_stream));

    let type0_font = dictionary! {
        "Type" => Object::Name(b"Font".to_vec()),
        "Subtype" => Object::Name(b"Type0".to_vec()),
        "BaseFont" => Object::Name(base_name.as_bytes().to_vec()),
        "Encoding" => Object::Name(b"Identity-H".to_vec()),
        "DescendantFonts" => Object::Array(vec![Object::Reference(cid_font_id)]),
        "ToUnicode" => Object::Reference(tounicode_id)
    };
    doc.add_object(Object::Dictionary(type0_font))
}
2008
/// Build a ToUnicode CMap stream that maps glyph ids to Unicode code points,
/// so viewers can extract/copy text rendered with the subset font.
fn generate_tounicode_cmap(gid_to_unicode: &[(u16, char)]) -> Vec<u8> {
    const PREAMBLE: &str = "/CIDInit /ProcSet findresource begin\n\
                            12 dict begin\n\
                            begincmap\n\
                            /CIDSystemInfo\n\
                            << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def\n\
                            /CMapName /Adobe-Identity-UCS def\n\
                            /CMapType 2 def\n\
                            1 begincodespacerange\n\
                            <0000> <FFFF>\n\
                            endcodespacerange\n";
    const EPILOGUE: &str = "endcmap\n\
                            CMapName currentdict /CMap defineresource pop\n\
                            end\nend\n";

    let mut cmap = String::with_capacity(gid_to_unicode.len() * 24 + 256);
    cmap.push_str(PREAMBLE);
    // bfchar sections are limited to 100 entries each per the CMap spec.
    for chunk in gid_to_unicode.chunks(100) {
        let _ = writeln!(cmap, "{} beginbfchar", chunk.len());
        for &(gid, ch) in chunk {
            let _ = writeln!(cmap, "<{gid:04X}> <{:04X}>", ch as u32);
        }
        cmap.push_str("endbfchar\n");
    }
    cmap.push_str(EPILOGUE);
    cmap.into_bytes()
}
2034
2035fn resolve_template_fonts(template_xml: &str, pdf_bytes: &[u8]) -> HashMap<String, ResolvedFont> {
2042 let mut resolved = HashMap::new();
2043 let entries = collect_template_font_entries(template_xml);
2044 if entries.is_empty() {
2045 return resolved;
2046 }
2047 let source_doc = match Document::load_mem(pdf_bytes) {
2048 Ok(d) => d,
2049 Err(_) => return resolved,
2050 };
2051 let embedded_fonts = extract_embedded_fonts(&source_doc);
2052 let mut resolver = XfaFontResolver::new(embedded_fonts);
2053 for entry in &entries {
2054 let spec = XfaFontSpec::from_xfa_attrs(
2055 &entry.typeface,
2056 entry.weight.as_deref(),
2057 entry.posture.as_deref(),
2058 None,
2059 entry.generic_family.as_deref(),
2060 );
2061 let key = font_variant_key(
2062 &entry.typeface,
2063 entry.weight.as_deref(),
2064 entry.posture.as_deref(),
2065 );
2066 match resolver.resolve(&spec) {
2067 Ok(font) => {
2068 resolved.insert(key, font);
2069 }
2070 Err(e) => {
2071 eprintln!("Font resolution failed for '{}': {}", entry.typeface, e);
2072 }
2073 }
2074 }
2075 resolved
2076}
2077
2078fn inject_resolved_metrics(
2087 tree: &mut xfa_layout_engine::form::FormTree,
2088 resolved: &HashMap<String, ResolvedFont>,
2089) {
2090 for i in 0..tree.nodes.len() {
2091 let id = xfa_layout_engine::form::FormNodeId(i);
2092 let style = &tree.meta(id).style;
2093 let font_family = style.font_family.clone();
2094 let font_weight = style.font_weight.clone();
2095 let font_style = style.font_style.clone();
2096 if let Some(ref family) = font_family {
2097 let variant_key =
2099 font_variant_key(family, font_weight.as_deref(), font_style.as_deref());
2100 let base_key = font_variant_key(family, None, None);
2101 let font = resolved
2102 .get(&variant_key)
2103 .or_else(|| resolved.get(&base_key));
2104 if let Some(font) = font {
2105 let (_first_char, widths) = font.pdf_glyph_widths();
2106 let node = tree.get_mut(id);
2107 node.font.resolved_widths = Some(widths);
2108 node.font.resolved_upem = Some(font.units_per_em);
2109 node.font.resolved_ascender = Some(font.ascender);
2110 node.font.resolved_descender = Some(font.descender);
2111 }
2112 }
2113 }
2114}
2115
2116fn simple_encoding_unicode_to_code_map(encoding: &PdfSimpleEncoding) -> HashMap<u16, u8> {
2121 let mut map = HashMap::new();
2122 for (code, unicode) in encoding.code_to_unicode_table().into_iter().enumerate() {
2123 if let Some(cp) = unicode {
2124 map.entry(cp).or_insert(code as u8);
2125 }
2126 }
2127 map
2128}
2129
2130fn add_text_chars_for_font(
2131 chars_by_font: &mut HashMap<String, HashSet<char>>,
2132 font_family: Option<&str>,
2133 font_weight: Option<&str>,
2134 font_style: Option<&str>,
2135 text: &str,
2136) {
2137 let Some(family) = font_family else {
2138 return;
2139 };
2140 if text.is_empty() {
2141 return;
2142 }
2143 let chars: Vec<char> = text.chars().filter(|c| !c.is_control()).collect();
2144 if chars.is_empty() {
2145 return;
2146 }
2147
2148 let variant = font_variant_key(family, font_weight, font_style);
2149 chars_by_font
2150 .entry(variant)
2151 .or_default()
2152 .extend(chars.iter().copied());
2153 chars_by_font
2154 .entry(family.to_string())
2155 .or_default()
2156 .extend(chars);
2157}
2158
2159fn add_text_chars_for_style(
2160 chars_by_font: &mut HashMap<String, HashSet<char>>,
2161 style: &FormNodeStyle,
2162 text: &str,
2163) {
2164 add_text_chars_for_font(
2165 chars_by_font,
2166 style.font_family.as_deref(),
2167 style.font_weight.as_deref(),
2168 style.font_style.as_deref(),
2169 text,
2170 );
2171}
2172
2173fn collect_used_chars_from_layout_node(
2174 node: &LayoutNode,
2175 chars_by_font: &mut HashMap<String, HashSet<char>>,
2176) {
2177 match &node.content {
2178 LayoutContent::Text(t) => add_text_chars_for_style(chars_by_font, &node.style, t),
2179 LayoutContent::Field { value, .. } => {
2180 add_text_chars_for_style(chars_by_font, &node.style, value)
2181 }
2182 LayoutContent::WrappedText { lines, .. } => {
2183 for line in lines {
2184 add_text_chars_for_style(chars_by_font, &node.style, line);
2185 }
2186 }
2187 LayoutContent::Draw(DrawContent::Text(t)) => {
2188 add_text_chars_for_style(chars_by_font, &node.style, t)
2189 }
2190 _ => {}
2191 }
2192
2193 if let Some(caption) = &node.style.caption_text {
2194 add_text_chars_for_style(chars_by_font, &node.style, caption);
2195 }
2196
2197 if let Some(spans) = &node.style.rich_text_spans {
2198 for span in spans {
2199 add_text_chars_for_font(
2200 chars_by_font,
2201 span.font_family
2202 .as_deref()
2203 .or(node.style.font_family.as_deref()),
2204 span.font_weight
2205 .as_deref()
2206 .or(node.style.font_weight.as_deref()),
2207 span.font_style
2208 .as_deref()
2209 .or(node.style.font_style.as_deref()),
2210 &span.text,
2211 );
2212 }
2213 }
2214
2215 for child in &node.children {
2216 collect_used_chars_from_layout_node(child, chars_by_font);
2217 }
2218}
2219
2220fn collect_used_chars_by_font(layout: &LayoutDom) -> HashMap<String, HashSet<char>> {
2221 let mut chars_by_font = HashMap::new();
2222 for page in &layout.pages {
2223 for node in &page.nodes {
2224 collect_used_chars_from_layout_node(node, &mut chars_by_font);
2225 }
2226 }
2227 chars_by_font
2228}
2229
2230fn simple_font_can_encode_char(font: &ResolvedFont, ch: char) -> bool {
2231 if ch.is_ascii() {
2232 return true;
2233 }
2234 if let Some(encoding) = &font.pdf_encoding {
2235 let Ok(cp) = u16::try_from(ch as u32) else {
2236 return false;
2237 };
2238 return encoding
2239 .code_to_unicode_table()
2240 .into_iter()
2241 .flatten()
2242 .any(|u| u == cp);
2243 }
2244 unicode_to_winansi(ch).is_some()
2245}
2246
/// Strip a weight/posture suffix from a font variant key, yielding the bare
/// family name; `None` when the key carries no variant suffix.
fn variant_key_base_name(key: &str) -> Option<&str> {
    const VARIANT_SUFFIXES: [&str; 4] = [
        "_Bold_Italic",
        "_Bold_Normal",
        "_Normal_Italic",
        "_Normal_Normal",
    ];
    VARIANT_SUFFIXES
        .iter()
        .find_map(|suffix| key.strip_suffix(suffix))
}
2253
#[allow(clippy::type_complexity)]
/// Embed (or reuse) every resolved font and return the bookkeeping the
/// renderer needs:
/// - variant key -> PDF resource name (e.g. "/XFA_F0"),
/// - (resource name, font object id) pairs for page resources,
/// - variant key -> metrics (widths, em metrics, optional raw font data).
///
/// A font that originated in the source PDF is reused in place when its
/// simple encoding covers every character the layout actually uses (or when
/// we have no font program bytes); otherwise it is re-embedded as a Type0 font.
fn embed_resolved_fonts(
    doc: &mut Document,
    resolved: &HashMap<String, ResolvedFont>,
    layout: &LayoutDom,
) -> (
    HashMap<String, String>,
    Vec<(String, ObjectId)>,
    HashMap<String, FontMetricsData>,
) {
    let mut font_map = HashMap::new();
    let mut font_objects = Vec::new();
    let mut metrics_data = HashMap::new();
    let used_chars_by_font = collect_used_chars_by_font(layout);
    for (idx, (name, font)) in resolved.iter().enumerate() {
        let resource_name = format!("XFA_F{}", idx);
        // Look up used characters by variant key, then the resolved font's own
        // name, then the variant key's bare family.
        let used_chars = used_chars_by_font
            .get(name)
            .or_else(|| used_chars_by_font.get(&font.name))
            .or_else(|| variant_key_base_name(name).and_then(|base| used_chars_by_font.get(base)));
        // No recorded usage counts as encodable (nothing to disprove it).
        let source_can_encode_all_text = used_chars.is_none_or(|chars| {
            chars
                .iter()
                .all(|ch| simple_font_can_encode_char(font, *ch))
        });
        // render_font_data is None when the original PDF font object is
        // reused as-is; otherwise the raw bytes feed the glyph renderer.
        let (obj_id, render_font_data) = if let Some(source_font) = font.pdf_source_font {
            if source_can_encode_all_text || font.data.is_empty() {
                (source_font.object_id, None)
            } else {
                (embed_font_in_pdf(doc, font), Some(font.data.clone()))
            }
        } else {
            (embed_font_in_pdf(doc, font), Some(font.data.clone()))
        };
        font_map.insert(name.clone(), format!("/{}", resource_name));
        font_objects.push((resource_name, obj_id));
        let (_first_char, widths) = font.pdf_glyph_widths();
        metrics_data.insert(
            name.clone(),
            FontMetricsData {
                widths,
                upem: font.units_per_em,
                ascender: font.ascender,
                descender: font.descender,
                font_data: render_font_data,
                face_index: font.face_index,
                simple_unicode_to_code: font
                    .pdf_encoding
                    .as_ref()
                    .map(simple_encoding_unicode_to_code_map),
            },
        );
    }
    (font_map, font_objects, metrics_data)
}
2321
2322fn static_fallback(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
2329 let mut doc = match Document::load_mem(pdf_bytes) {
2330 Ok(d) => d,
2331 Err(e) => {
2332 eprintln!("static_fallback: lopdf load failed ({e}), returning original bytes");
2333 return Ok(pdf_bytes.to_vec());
2334 }
2335 };
2336 strip_widgets_and_acroform(&mut doc);
2337 javascript_policy::strip_javascript_for_flatten(&mut doc);
2338 let mut out = Vec::new();
2339 if let Err(e) = doc.save_to(&mut out) {
2340 eprintln!("static_fallback: save failed ({e}), returning original bytes");
2341 return Ok(pdf_bytes.to_vec());
2342 }
2343 Ok(out)
2344}
2345
/// Cheap page-count heuristic over the raw form XML: the number of
/// `<pageArea` occurrences, or `None` when there are none.
fn form_dom_page_count(form_xml: &str) -> Option<usize> {
    let count = form_xml.matches("<pageArea").count();
    (count > 0).then_some(count)
}
2362
/// Reconcile the layout FormTree with Acrobat's saved form DOM XML:
/// propagate `presence="hidden"`, fill empty field values from the form DOM,
/// replicate repeat-subform instances so counts match, and hide named
/// subforms that the form DOM no longer contains.
///
/// Matching walks both trees in parallel by element name; unparseable form
/// XML is silently ignored.
fn apply_form_dom_presence(tree: &mut FormTree, root_id: FormNodeId, form_xml: &str) {
    use xfa_layout_engine::form::{FormNodeType, Presence};

    let Ok(doc) = roxmltree::Document::parse(form_xml) else {
        return;
    };

    // Deep-copy a subtree, appending the clone to the tree's node arena.
    // xfa_id is cleared on every clone so it is not duplicated.
    fn clone_subtree(tree: &mut FormTree, src_id: FormNodeId) -> FormNodeId {
        let node = tree.get(src_id).clone();
        let meta = tree.meta(src_id).clone();
        let child_ids: Vec<FormNodeId> = node.children.clone();
        let mut new_node = node;
        new_node.children = Vec::new();
        let mut new_meta = meta;
        new_meta.xfa_id = None;
        let new_id = tree.add_node_with_meta(new_node, new_meta);
        for &child_id in &child_ids {
            let cloned_child = clone_subtree(tree, child_id);
            tree.get_mut(new_id).children.push(cloned_child);
        }
        new_id
    }

    // Reset every Field node in the subtree to an empty value (used before
    // replication so per-instance values come from the form DOM, not the template).
    fn clear_field_values_in_subtree(tree: &mut FormTree, root_id: FormNodeId) {
        let child_ids: Vec<FormNodeId> = tree.get(root_id).children.clone();
        if let FormNodeType::Field { .. } = tree.get(root_id).node_type {
            tree.get_mut(root_id).node_type = FormNodeType::Field {
                value: String::new(),
            };
        }
        for cid in child_ids {
            clear_field_values_in_subtree(tree, cid);
        }
    }

    // Text of the first element inside a <field>'s <value> wrapper, if any.
    fn extract_field_value(xml_field: roxmltree::Node<'_, '_>) -> Option<String> {
        let value_el = xml_field
            .children()
            .find(|c| c.is_element() && c.tag_name().name() == "value")?;
        let inner = value_el.children().find(|c| c.is_element())?;
        inner.text().map(|t| t.to_string())
    }

    // Core walk: apply one form-DOM element's effects to its matched form
    // node, then match and recurse into children by name.
    fn apply_recursive(
        tree: &mut FormTree,
        form_node_id: FormNodeId,
        xml_node: roxmltree::Node<'_, '_>,
    ) {
        let xml_tag = xml_node.tag_name().name();
        if xml_tag != "subform" && xml_tag != "field" && xml_tag != "form" {
            return;
        }

        // presence="hidden" on the form DOM wins over the template.
        if xml_tag == "subform" || xml_tag == "field" {
            if let Some(pres) = xml_node.attribute("presence") {
                if pres == "hidden" {
                    tree.meta_mut(form_node_id).presence = Presence::Hidden;
                    let som = tree.get(form_node_id).name.clone();
                    trace_sites::presence(
                        &som,
                        TraceReason::PresenceHidden,
                        "form_dom_presence_hidden",
                    );
                }
            }
        }

        // Fields: adopt the form-DOM value only when the tree value is still
        // empty (data-bound values take precedence). Fields are leaves here.
        if xml_tag == "field" {
            if let Some(val) = extract_field_value(xml_node) {
                if let FormNodeType::Field { ref value, .. } = tree.get(form_node_id).node_type {
                    if value.is_empty() {
                        tree.get_mut(form_node_id).node_type = FormNodeType::Field { value: val };
                    }
                }
            }
            return; }

        let xml_children: Vec<roxmltree::Node<'_, '_>> = xml_node
            .children()
            .filter(|c| {
                c.is_element()
                    && (c.tag_name().name() == "subform"
                        || c.tag_name().name() == "field"
                        || c.tag_name().name() == "draw")
            })
            .collect();

        // Group consecutive same-named XML children: each run is a repeat
        // group whose size dictates the required instance count.
        let mut xml_groups: Vec<(&str, Vec<roxmltree::Node<'_, '_>>)> = Vec::new();
        for &xc in &xml_children {
            let xname = xc.attribute("name").unwrap_or("");
            if let Some(last) = xml_groups.last_mut() {
                if last.0 == xname {
                    last.1.push(xc);
                    continue;
                }
            }
            xml_groups.push((xname, vec![xc]));
        }

        let mut form_children = tree.get(form_node_id).children.clone();
        // Tracks which form children have been consumed by a match.
        let mut used = vec![false; form_children.len()];

        for (gname, group_xml_nodes) in &xml_groups {
            let xml_count = group_xml_nodes.len();

            // Unconsumed form children with this group's name.
            let existing: Vec<(usize, FormNodeId)> = form_children
                .iter()
                .enumerate()
                .filter(|(i, &fid)| !used[*i] && tree.get(fid).name == *gname)
                .map(|(i, &fid)| (i, fid))
                .collect();
            let existing_count = existing.len();

            // Compatibility policy decides whether/how many instances to clone.
            let replication = crate::adobe_compat::form_dom_driven_repeat_instance_replication(
                gname,
                xml_count,
                existing_count,
            );
            if replication.clones_to_add > 0 {
                let template_id = existing[0].1;
                let last_existing_idx = existing.last().unwrap().0;
                // Clones go right after the last existing instance.
                let insert_pos = last_existing_idx + 1;
                let clones_needed = replication.clones_to_add;
                // All instances restart with empty field values; the form DOM
                // re-populates them during the recursive matching below.
                for (_idx, fid) in &existing {
                    clear_field_values_in_subtree(tree, *fid);
                }
                let mut new_ids = Vec::new();
                for _ in 0..clones_needed {
                    let cloned = clone_subtree(tree, template_id);
                    clear_field_values_in_subtree(tree, cloned);
                    new_ids.push(cloned);
                }
                for (offset, new_id) in new_ids.iter().enumerate() {
                    form_children.insert(insert_pos + offset, *new_id);
                    used.insert(insert_pos + offset, false);
                }
                tree.get_mut(form_node_id).children = form_children.clone();
            }

            // Match each XML instance to the next unused same-named form
            // child; instances after the first skip past the last matched one
            // so repeats pair up positionally.
            for (group_idx, &xc) in group_xml_nodes.iter().enumerate() {
                let matched = form_children
                    .iter()
                    .enumerate()
                    .skip(if group_idx > 0 {
                        form_children
                            .iter()
                            .enumerate()
                            .rfind(|(i, &fid)| used[*i] && tree.get(fid).name == *gname)
                            .map(|(i, _)| i + 1)
                            .unwrap_or(0)
                    } else {
                        0
                    })
                    .find(|(i, &fid)| !used[*i] && tree.get(fid).name == *gname);
                if let Some((idx, &fid)) = matched {
                    used[idx] = true;
                    apply_recursive(tree, fid, xc);
                }
            }
        }

        // When the form DOM lists subform children at all, any named subform
        // child it does NOT list has been removed in Acrobat — hide it.
        let has_subform_children = xml_children
            .iter()
            .any(|c| c.tag_name().name() == "subform");
        if has_subform_children {
            for (i, &fid) in form_children.iter().enumerate() {
                if used[i] {
                    continue;
                }
                let child_node = tree.get(fid);
                if matches!(child_node.node_type, FormNodeType::Subform)
                    && !child_node.name.is_empty()
                {
                    let som = child_node.name.clone();
                    tree.meta_mut(fid).presence = Presence::Hidden;
                    trace_sites::presence(
                        &som,
                        TraceReason::PresenceHidden,
                        "form_dom_absent_subform_hidden",
                    );
                }
            }
        }
    }

    // Entry: pair the form DOM's root <subform> with the same-named child of
    // the tree root, then recurse.
    let form_root = doc.root_element();
    let form_root_subform = form_root
        .children()
        .find(|c| c.is_element() && c.tag_name().name() == "subform");

    if let Some(xml_root_sf) = form_root_subform {
        let root_children = tree.get(root_id).children.clone();
        let root_name = xml_root_sf.attribute("name").unwrap_or("");
        for &child_id in &root_children {
            if tree.get(child_id).name == root_name {
                apply_recursive(tree, child_id, xml_root_sf);
                break;
            }
        }
    }
}
2664
2665fn is_corrupt_xfa_template(pdf_size: usize, template_xml: &str) -> bool {
2669 if pdf_size >= 1024 {
2671 return false;
2672 }
2673 match roxmltree::Document::parse(template_xml) {
2675 Ok(doc) => {
2676 let root = doc.root_element();
2677 !root.children().any(|c| {
2678 c.is_element()
2679 && matches!(c.tag_name().name(), "subform" | "pageSet" | "subformSet")
2680 })
2681 }
2682 Err(_) => true, }
2684}
2685
2686fn strip_undefined_xml_entities(xml: &str) -> String {
2703 let predefined = ["lt", "gt", "amp", "quot", "apos"];
2704 let mut result = String::with_capacity(xml.len());
2705 let bytes = xml.as_bytes();
2706 let mut pos = 0;
2707
2708 while let Some(rel_amp_pos) = xml[pos..].find('&') {
2709 let amp_pos = pos + rel_amp_pos;
2710 result.push_str(&xml[pos..amp_pos]);
2711
2712 if let Some((entity_name, next_pos)) = parse_xml_entity_reference(xml, amp_pos) {
2713 if entity_name.starts_with('#') || predefined.contains(&entity_name) {
2717 result.push_str(&xml[amp_pos..next_pos]);
2718 }
2719 pos = next_pos;
2720 } else {
2721 result.push('&');
2723 pos = amp_pos + 1;
2724 }
2725 }
2726
2727 if pos < bytes.len() {
2728 result.push_str(&xml[pos..]);
2729 }
2730 result
2731}
2732
/// Try to parse an entity reference starting at the `&` at byte `amp_pos`.
///
/// Returns `(name, end)` where `name` excludes the `&` and `;` (but includes
/// the leading `#` for character references) and `end` is the byte offset
/// just past the `;`. Returns `None` when the text at `amp_pos` is not a
/// well-formed reference. Operates on ASCII byte classes only.
fn parse_xml_entity_reference(xml: &str, amp_pos: usize) -> Option<(&str, usize)> {
    let bytes = xml.as_bytes();
    let start = amp_pos + 1;
    let first = *bytes.get(start)?;

    if first == b'#' {
        // Character reference: &#123; or &#x1F; — at least one digit required,
        // terminated by ';'.
        let mut idx = start + 1;
        if matches!(bytes.get(idx), Some(b'x' | b'X')) {
            idx += 1;
            let hex_start = idx;
            while matches!(
                bytes.get(idx),
                Some(b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
            ) {
                idx += 1;
            }
            if idx == hex_start || !matches!(bytes.get(idx), Some(b';')) {
                return None;
            }
        } else {
            let digits_start = idx;
            while matches!(bytes.get(idx), Some(b'0'..=b'9')) {
                idx += 1;
            }
            if idx == digits_start || !matches!(bytes.get(idx), Some(b';')) {
                return None;
            }
        }
        // idx points at ';'; name keeps the leading '#'.
        return Some((&xml[start..idx], idx + 1));
    }

    // Named reference: first character must be a valid XML name start.
    if !is_xml_name_start(first) {
        return None;
    }

    // Scan name characters until the closing ';'; any invalid byte (or end
    // of input) means this was not a reference.
    let mut idx = start + 1;
    while let Some(&b) = bytes.get(idx) {
        if b == b';' {
            return Some((&xml[start..idx], idx + 1));
        }
        if !is_xml_name_char(b) {
            return None;
        }
        idx += 1;
    }
    None
}
2783
/// ASCII subset of the XML `NameStartChar` production: letters, `_`, `:`.
fn is_xml_name_start(byte: u8) -> bool {
    byte == b':' || byte == b'_' || byte.is_ascii_alphabetic()
}
2787
/// ASCII subset of the XML `NameChar` production: any name-start character
/// (letters, `_`, `:`) plus `-`, `.`, and digits.
fn is_xml_name_char(byte: u8) -> bool {
    matches!(byte, b':' | b'_' | b'-' | b'.') || byte.is_ascii_alphanumeric()
}
2791
2792fn pages_have_static_content(doc: &Document) -> bool {
2805 for page_id in doc.page_iter() {
2806 let streams = page_content_streams(doc, page_id);
2807 if streams.is_empty() {
2808 continue;
2809 }
2810
2811 let mut text_op_count = 0usize;
2817 for stream in &streams {
2818 if is_xfa_placeholder_stream(stream) || is_watermark_stream(stream) {
2819 continue;
2820 }
2821 text_op_count += count_text_operators(stream);
2822 }
2823
2824 if text_op_count >= 5 {
2825 return true;
2826 }
2827 }
2828 false
2829}
2830
/// Collects the raw bytes of every content stream referenced by `page_id`.
///
/// `/Contents` may be a direct array of stream references, a reference to such
/// an array, a reference to a single stream, or an inline stream — all four
/// shapes are handled. Entries that cannot be resolved to a stream are
/// skipped; an unreadable page dictionary yields an empty vector.
fn page_content_streams(doc: &Document, page_id: ObjectId) -> Vec<Vec<u8>> {
    let Ok(page_dict) = doc.get_dictionary(page_id) else {
        return Vec::new();
    };

    match page_dict.get(b"Contents") {
        Ok(Object::Array(arr)) => arr
            .iter()
            .filter_map(|object| resolve_stream_content(doc, object))
            .collect(),
        // Follow one level of indirection; the target may itself be an array
        // or a single stream object.
        Ok(Object::Reference(id)) => match doc.get_object(*id) {
            Ok(Object::Array(arr)) => arr
                .iter()
                .filter_map(|object| resolve_stream_content(doc, object))
                .collect(),
            Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
            Err(_) => Vec::new(),
        },
        Ok(object) => resolve_stream_content(doc, object).into_iter().collect(),
        Err(_) => Vec::new(),
    }
}
2853
2854fn resolve_stream_content(doc: &Document, object: &Object) -> Option<Vec<u8>> {
2855 let stream = match object {
2856 Object::Reference(id) => doc.get_object(*id).ok()?.as_stream().ok()?,
2857 Object::Stream(stream) => stream,
2858 _ => return None,
2859 };
2860
2861 stream
2862 .get_plain_content()
2863 .ok()
2864 .or_else(|| Some(stream.content.clone()))
2865}
2866
/// Counts text-showing operators (`Tj` / `TJ`) in a raw content stream.
///
/// Heuristic used to decide whether a page carries real static content. A
/// `T` followed by `j`/`J` only counts when preceded by a delimiter, so
/// identifiers that merely contain "Tj" are not counted.
///
/// Fix: the previous version accepted only ` `, `)` and `]` as the preceding
/// delimiter; PDF syntax also allows the other whitespace bytes (NUL, TAB,
/// LF, FF, CR) between an operand and its operator, so those are accepted
/// too — otherwise newline-separated operators were silently missed.
fn count_text_operators(stream: &[u8]) -> usize {
    // PDF delimiters that can legally precede an operator here:
    // whitespace (NUL TAB LF FF CR SP) plus the `)` / `]` operand closers.
    const DELIMS: &[u8] = b"\0\t\n\x0c\r )]";
    stream
        .windows(3)
        .filter(|w| DELIMS.contains(&w[0]) && w[1] == b'T' && (w[2] == b'j' || w[2] == b'J'))
        .count()
}
2880
/// Bakes the "on" appearance of checked checkbox/radio widgets into the
/// page's own content so the check mark survives later widget removal.
///
/// For each widget annotation whose normal appearance (`/AP` → `/N`) is a
/// state dictionary, the first non-`Off` state backed by a stream reference
/// is stamped at the annotation's `/Rect` origin via a `Do` overlay. Widgets
/// whose selected state is `Off` (unchecked) are skipped. Returns the number
/// of widgets baked on this page.
fn bake_checkbox_radio_ap_marks(doc: &mut Document, page_id: ObjectId) -> usize {
    let annots = page_annotations(doc, page_id);
    if annots.is_empty() {
        return 0;
    }

    let mut baked = 0usize;
    let mut overlay_ops = Vec::new();

    for annot in &annots {
        let Some(annot_id) = annot.as_reference().ok() else {
            continue;
        };
        let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
            continue;
        };

        // Only widget annotations carry form-field appearances.
        let is_widget = annot_dict
            .get(b"Subtype")
            .ok()
            .and_then(|obj| obj.as_name().ok())
            == Some(&b"Widget"[..]);
        if !is_widget {
            continue;
        }

        let ap = match annot_dict.get(b"AP").ok().and_then(|o| o.as_dict().ok()) {
            Some(ap) => ap.clone(),
            None => continue,
        };
        let normal_obj = match ap.get(b"N").ok() {
            Some(obj) => obj.clone(),
            None => continue,
        };

        // A checkbox/radio `/N` entry is a dictionary mapping state names
        // (e.g. /Yes, /Off) to appearance streams; any other shape means
        // this is not a toggle widget.
        let states: Dictionary = match &normal_obj {
            Object::Reference(id) => match doc.get_object(*id).ok().cloned() {
                Some(Object::Dictionary(d)) => d,
                _ => continue, },
            Object::Dictionary(d) => d.clone(),
            _ => continue,
        };

        // Selected state `Off` means the box is unchecked — nothing to bake.
        if matches!(selected_widget_state(&annot_dict), Some(state) if state == b"Off") {
            continue;
        }

        // Pick the first non-`Off` state backed by a reference: the "on"
        // appearance stream.
        let on_id = states
            .iter()
            .filter(|(name, _)| name.as_slice() != b"Off")
            .find_map(|(_, obj)| match obj {
                Object::Reference(id) => Some(*id),
                _ => None,
            });
        let Some(ap_id) = on_id else { continue };

        // Only a real stream object can be drawn with `Do`.
        match doc.get_object(ap_id).ok() {
            Some(Object::Stream(_)) => {}
            _ => continue,
        }

        let Some(rect) = annotation_rect(&annot_dict) else {
            continue;
        };

        // Register the appearance stream as a page XObject and stamp it at
        // the widget's lower-left corner.
        let xobject_name = format!("XfaCbAp{}", baked);
        add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
        write_ops(
            &mut overlay_ops,
            format_args!(
                "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
                rect[0], rect[1], xobject_name
            ),
        );
        baked += 1;
    }

    if !overlay_ops.is_empty() {
        append_to_page_content(doc, page_id, &overlay_ops);
    }

    baked
}
2980
2981fn is_xfa_placeholder_stream(stream: &[u8]) -> bool {
2982 const PLACEHOLDER_MARKERS: [&[u8]; 5] = [
2983 b"Please wait",
2984 b"Adobe Reader",
2985 b"reader_download",
2986 b"display this type of document",
2987 b"To view the full contents",
2988 ];
2989
2990 PLACEHOLDER_MARKERS
2991 .iter()
2992 .any(|marker| contains_ascii_case_insensitive(stream, marker))
2993}
2994
2995fn is_watermark_stream(stream: &[u8]) -> bool {
2999 const WATERMARK_MARKERS: [&[u8]; 3] =
3000 [b"Evaluation Only", b"Qoppa Software", b"For Evaluation"];
3001 WATERMARK_MARKERS
3002 .iter()
3003 .any(|marker| contains_ascii_case_insensitive(stream, marker))
3004}
3005
3006fn contains_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
3007 haystack
3008 .windows(needle.len())
3009 .any(|window| window.eq_ignore_ascii_case(needle))
3010}
3011
/// Renders the formatted PDF operator text in `args` and appends it to `buf`
/// as raw bytes. Formatting errors are ignored (formatting into a `String`
/// only fails if a `Display` impl itself errors).
fn write_ops(buf: &mut Vec<u8>, args: std::fmt::Arguments<'_>) {
    use std::fmt::Write as _;

    let mut rendered = String::new();
    let _ = rendered.write_fmt(args);
    buf.extend_from_slice(rendered.as_bytes());
}
3019
/// Replaces widget annotations with static page content on every page.
///
/// Each widget that has both a `/Rect` and a resolvable normal appearance
/// stream is drawn directly into the page content (a `Do` overlay at the
/// rect's lower-left corner) and dropped from `/Annots`. Widgets lacking
/// either are retained untouched, as are all non-widget annotations and
/// entries that are not references. Returns the total number of widgets
/// flattened across the document.
fn flatten_widget_appearances(doc: &mut Document) -> usize {
    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
    let mut flattened = 0usize;

    for page_id in page_ids {
        let annots = page_annotations(doc, page_id);
        if annots.is_empty() {
            continue;
        }

        let mut retained = Vec::new();
        let mut overlay_ops = Vec::new();

        for annot in annots {
            let Some(annot_id) = annot.as_reference().ok() else {
                retained.push(annot);
                continue;
            };

            let Ok(annot_dict) = doc.get_dictionary(annot_id).cloned() else {
                retained.push(annot);
                continue;
            };

            let is_widget = annot_dict
                .get(b"Subtype")
                .ok()
                .and_then(|obj| obj.as_name().ok())
                == Some(&b"Widget"[..]);
            if !is_widget {
                retained.push(annot);
                continue;
            }

            // A widget without a rect or without a drawable appearance
            // cannot be baked — keep it as a live annotation.
            let Some(rect) = annotation_rect(&annot_dict) else {
                retained.push(Object::Reference(annot_id));
                continue;
            };
            let Some(ap_id) = resolve_widget_normal_appearance(doc, &annot_dict) else {
                retained.push(Object::Reference(annot_id));
                continue;
            };

            // Stamp the appearance stream into the page content.
            let xobject_name = format!("XfaAp{}", flattened);
            add_xobject_to_page_resources(doc, page_id, &xobject_name, ap_id);
            write_ops(
                &mut overlay_ops,
                format_args!(
                    "q 1 0 0 1 {:.3} {:.3} cm /{} Do Q\n",
                    rect[0], rect[1], xobject_name
                ),
            );
            flattened += 1;
        }

        // No widget was baked on this page: leave /Annots untouched.
        if overlay_ops.is_empty() {
            continue;
        }

        append_to_page_content(doc, page_id, &overlay_ops);
        set_page_annotations(doc, page_id, retained);
    }

    flattened
}
3092
3093fn strip_widget_annotations(doc: &mut Document, page_id: ObjectId) {
3095 let annots = page_annotations(doc, page_id);
3096 if annots.is_empty() {
3097 return;
3098 }
3099 let mut retained = Vec::new();
3100 for annot in &annots {
3101 let is_widget = annot
3102 .as_reference()
3103 .ok()
3104 .and_then(|id| doc.get_dictionary(id).ok())
3105 .and_then(|d| d.get(b"Subtype").ok())
3106 .and_then(|obj| obj.as_name().ok())
3107 == Some(&b"Widget"[..]);
3108 if !is_widget {
3109 retained.push(annot.clone());
3110 }
3111 }
3112 set_page_annotations(doc, page_id, retained);
3113}
3114
3115fn page_annotations(doc: &Document, page_id: ObjectId) -> Vec<Object> {
3116 let Ok(page_dict) = doc.get_dictionary(page_id) else {
3117 return Vec::new();
3118 };
3119
3120 match page_dict.get(b"Annots") {
3121 Ok(Object::Array(arr)) => arr.clone(),
3122 Ok(Object::Reference(id)) => doc
3123 .get_object(*id)
3124 .ok()
3125 .and_then(|obj| obj.as_array().ok().cloned())
3126 .unwrap_or_default(),
3127 _ => Vec::new(),
3128 }
3129}
3130
3131fn set_page_annotations(doc: &mut Document, page_id: ObjectId, annots: Vec<Object>) {
3132 if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
3133 if annots.is_empty() {
3134 page_dict.remove(b"Annots");
3135 } else {
3136 page_dict.set("Annots", Object::Array(annots));
3137 }
3138 }
3139}
3140
3141fn annotation_rect(dict: &Dictionary) -> Option<[f32; 4]> {
3142 let rect = dict.get(b"Rect").ok()?.as_array().ok()?;
3143 if rect.len() != 4 {
3144 return None;
3145 }
3146 Some([
3147 rect[0].as_float().ok()?,
3148 rect[1].as_float().ok()?,
3149 rect[2].as_float().ok()?,
3150 rect[3].as_float().ok()?,
3151 ])
3152}
3153
3154fn resolve_widget_normal_appearance(
3155 doc: &mut Document,
3156 annot_dict: &Dictionary,
3157) -> Option<ObjectId> {
3158 let ap = annot_dict.get(b"AP").ok()?.as_dict().ok()?;
3159 let normal = ap.get(b"N").ok()?;
3160 resolve_appearance_object(doc, annot_dict, normal)
3161}
3162
/// Resolves an appearance entry to the id of a stream object drawable via
/// `Do`.
///
/// References are followed one level; a dictionary is treated as a state map
/// (`/Yes`, `/Off`, ...) and delegated to `resolve_appearance_state`. An
/// inline stream has no object id yet, so a new document object is created
/// for it — note that this mutates `doc` even on an otherwise read-only path.
fn resolve_appearance_object(
    doc: &mut Document,
    annot_dict: &Dictionary,
    object: &Object,
) -> Option<ObjectId> {
    match object {
        Object::Reference(id) => match doc.get_object(*id).ok()?.clone() {
            Object::Stream(_) => Some(*id),
            Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, &states),
            _ => None,
        },
        Object::Stream(stream) => Some(doc.add_object(Object::Stream(stream.clone()))),
        Object::Dictionary(states) => resolve_appearance_state(doc, annot_dict, states),
        _ => None,
    }
}
3179
/// Picks the appearance stream for one state out of a state dictionary
/// (`/Yes`, `/On`, `/Off`, ...).
///
/// Resolution order:
/// 1. the widget's own selected state (`/AS`, falling back to `/V`);
/// 2. if the selected state is explicitly `Off` and had no stream, the
///    widget is unchecked — resolve to nothing rather than a fallback mark;
/// 3. the conventional names `Yes`, `On`, `Off` in that order;
/// 4. any state at all, as a last resort.
fn resolve_appearance_state(
    doc: &mut Document,
    annot_dict: &Dictionary,
    states: &Dictionary,
) -> Option<ObjectId> {
    if let Some(state) = selected_widget_state(annot_dict) {
        if let Ok(object) = states.get(state) {
            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
                return Some(id);
            }
        }
        if state == b"Off" {
            return None;
        }
    }

    for fallback in [b"Yes".as_slice(), b"On".as_slice(), b"Off".as_slice()] {
        if let Ok(object) = states.get(fallback) {
            if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
                return Some(id);
            }
        }
    }

    // Last resort: any resolvable state entry.
    for (_name, object) in states.iter() {
        if let Some(id) = resolve_appearance_object(doc, annot_dict, object) {
            return Some(id);
        }
    }

    None
}
3214
3215fn selected_widget_state(annot_dict: &Dictionary) -> Option<&[u8]> {
3216 annot_dict
3217 .get(b"AS")
3218 .ok()
3219 .and_then(|obj| obj.as_name().ok())
3220 .or_else(|| annot_dict.get(b"V").ok().and_then(|obj| obj.as_name().ok()))
3221}
3222
/// Registers `xobject_id` under `name` in the page's `/Resources` `/XObject`
/// dictionary, whatever shape those dictionaries currently take.
///
/// Handled layouts, tried in order:
/// 1. `/Resources` is a reference whose `/XObject` is itself a reference —
///    insert into the referenced XObject dictionary;
/// 2. `/Resources` is a reference with an inline (or absent) `/XObject` —
///    insert into the referenced resources dictionary;
/// 3. `/Resources` is inline but its `/XObject` is a reference — insert into
///    the referenced XObject dictionary;
/// 4. `/Resources` is fully inline — insert in place;
/// 5. no `/Resources` at all — create one on the page.
fn add_xobject_to_page_resources(
    doc: &mut Document,
    page_id: ObjectId,
    name: &str,
    xobject_id: ObjectId,
) {
    // Case 1/2: /Resources held behind a reference.
    let resources_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_reference().ok())
    });

    if let Some(resources_id) = resources_ref {
        let xobject_ref = doc.get_dictionary(resources_id).ok().and_then(|resources| {
            resources
                .get(b"XObject")
                .ok()
                .and_then(|obj| obj.as_reference().ok())
        });

        if let Some(xobject_dict_id) = xobject_ref {
            if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
                xobjects.set(name, Object::Reference(xobject_id));
                return;
            }
        }

        if let Ok(Object::Dictionary(ref mut resources)) = doc.get_object_mut(resources_id) {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }
    }

    // Case 3: inline /Resources whose /XObject is a reference.
    let inline_xobject_ref = doc.get_dictionary(page_id).ok().and_then(|page_dict| {
        page_dict
            .get(b"Resources")
            .ok()
            .and_then(|obj| obj.as_dict().ok())
            .and_then(|resources| {
                resources
                    .get(b"XObject")
                    .ok()
                    .and_then(|obj| obj.as_reference().ok())
            })
    });

    if let Some(xobject_dict_id) = inline_xobject_ref {
        if let Ok(Object::Dictionary(ref mut xobjects)) = doc.get_object_mut(xobject_dict_id) {
            xobjects.set(name, Object::Reference(xobject_id));
            return;
        }
    }

    // Case 4/5: fully inline /Resources, or none — create as needed.
    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        if let Ok(Object::Dictionary(ref mut resources)) = page_dict.get_mut(b"Resources") {
            add_xobject_to_resources_dict(resources, name, xobject_id);
            return;
        }

        let mut resources = Dictionary::new();
        add_xobject_to_resources_dict(&mut resources, name, xobject_id);
        page_dict.set("Resources", Object::Dictionary(resources));
    }
}
3288
3289fn add_xobject_to_resources_dict(resources: &mut Dictionary, name: &str, xobject_id: ObjectId) {
3290 if let Ok(Object::Dictionary(ref mut xobjects)) = resources.get_mut(b"XObject") {
3291 xobjects.set(name, Object::Reference(xobject_id));
3292 } else {
3293 let mut xobjects = Dictionary::new();
3294 xobjects.set(name, Object::Reference(xobject_id));
3295 resources.set("XObject", Object::Dictionary(xobjects));
3296 }
3297}
3298
/// Appends `data` as a new content stream at the end of the page's
/// `/Contents`, preserving everything already there.
///
/// Existing entries — direct streams, references, arrays, referenced or
/// nested arrays — are first normalized into a flat list of stream
/// references; the new stream is pushed last so it paints on top. A single
/// resulting entry is stored directly rather than as a one-element array.
fn append_to_page_content(doc: &mut Document, page_id: ObjectId, data: &[u8]) {
    let new_stream_id = doc.add_object(Object::Stream(Stream::new(dictionary! {}, data.to_vec())));

    let contents = doc
        .get_dictionary(page_id)
        .ok()
        .and_then(|page_dict| page_dict.get(b"Contents").ok().cloned());

    let new_contents = match contents {
        Some(existing) => {
            let mut flattened = Vec::new();
            flatten_page_contents_entries(doc, existing, &mut flattened);
            flattened.push(Object::Reference(new_stream_id));
            if flattened.len() == 1 {
                // Only the new stream: store it directly.
                flattened.pop().unwrap()
            } else {
                Object::Array(flattened)
            }
        }
        None => Object::Reference(new_stream_id),
    };

    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        page_dict.set("Contents", new_contents);
    }
}
3331
/// Recursively normalizes a `/Contents` value into a flat list of objects,
/// appended to `out`.
///
/// Arrays (direct or behind a reference) are expanded element by element;
/// inline streams are promoted to document objects so the result contains
/// only references (plus any unrecognized objects passed through verbatim).
fn flatten_page_contents_entries(doc: &mut Document, object: Object, out: &mut Vec<Object>) {
    match object {
        Object::Reference(id) => match doc.get_object(id).cloned() {
            Ok(Object::Array(items)) => {
                for item in items {
                    flatten_page_contents_entries(doc, item, out);
                }
            }
            // A reference to anything but an array is kept as-is (including
            // dangling references — resolution failures are passed through).
            _ => out.push(Object::Reference(id)),
        },
        Object::Array(items) => {
            for item in items {
                flatten_page_contents_entries(doc, item, out);
            }
        }
        Object::Stream(stream) => {
            // Inline stream: register it so the flat list is all references.
            let stream_id = doc.add_object(Object::Stream(stream));
            out.push(Object::Reference(stream_id));
        }
        other => out.push(other),
    }
}
3354
/// Strips all interactive-form artifacts (AcroForm dictionary, XFA streams,
/// widget annotations) from the document. Thin wrapper kept for call-site
/// readability; the actual work happens in `remove_acroform`.
fn strip_widgets_and_acroform(doc: &mut Document) {
    remove_acroform(doc);
}
3363
/// Replaces the page's content and resources wholesale with the rendered
/// overlay.
///
/// Embeds the overlay's images as XObjects (failures are logged and the
/// image skipped), installs a fresh `/Contents` stream and `/Resources`
/// dictionary, and — when both dimensions are provided — rewrites the
/// page's `/MediaBox` to `[0 0 w h]`.
fn write_page_content(
    doc: &mut Document,
    page_id: ObjectId,
    overlay: &PageOverlay,
    font_ids: &[ObjectId; 3],
    embedded_fonts: &[(String, ObjectId)],
    page_width: Option<f64>,
    page_height: Option<f64>,
) -> Result<()> {
    let mut resources = make_resources_dict(font_ids, embedded_fonts);

    let mut xobjects = Dictionary::new();
    for img in &overlay.images {
        match embed_image(doc, &img.data, &img.mime_type) {
            Ok(result) => {
                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
            }
            Err(e) => {
                // Best-effort: a bad image should not abort the whole page.
                eprintln!("failed to embed image {}: {}", img.name, e);
            }
        }
    }
    if !xobjects.is_empty() {
        resources.set("XObject", Object::Dictionary(xobjects));
    }

    let stream = Stream::new(
        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
        overlay.content_stream.clone(),
    );
    let stream_id = doc.add_object(Object::Stream(stream));

    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        page_dict.set("Contents", Object::Reference(stream_id));
        page_dict.set("Resources", Object::Dictionary(resources));
        if let (Some(w), Some(h)) = (page_width, page_height) {
            page_dict.set(
                "MediaBox",
                Object::Array(vec![
                    Object::Real(0.0),
                    Object::Real(0.0),
                    Object::Real(w as f32),
                    Object::Real(h as f32),
                ]),
            );
        }
    }
    Ok(())
}
3417
/// Draws the rendered overlay on top of the page's existing content
/// (contrast with `write_page_content`, which replaces it).
///
/// Embeds the overlay's images (failures logged and skipped), merges the XFA
/// fonts/XObjects into the page's existing resources, then appends the
/// overlay's content stream so it paints after — i.e. above — the original
/// page content.
fn overlay_page_content(
    doc: &mut Document,
    page_id: ObjectId,
    overlay: &PageOverlay,
    font_ids: &[ObjectId; 3],
    embedded_fonts: &[(String, ObjectId)],
) -> Result<()> {
    let xfa_resources = make_resources_dict(font_ids, embedded_fonts);

    let mut xfa_xobjects = Dictionary::new();
    for img in &overlay.images {
        match embed_image(doc, &img.data, &img.mime_type) {
            Ok(result) => {
                xfa_xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
            }
            Err(e) => {
                // Best-effort: a bad image should not abort the whole page.
                eprintln!("failed to embed image {}: {}", img.name, e);
            }
        }
    }

    merge_xfa_resources_into_page(doc, page_id, &xfa_resources, &xfa_xobjects);

    if !overlay.content_stream.is_empty() {
        append_to_page_content(doc, page_id, &overlay.content_stream);
    }

    Ok(())
}
3452
/// Merges XFA fonts and image XObjects into the page's resources.
///
/// Precedence rules: for `/Font`, existing page entries win (XFA fonts are
/// only added under names not already taken); for `/XObject`, the XFA
/// entries win (they are freshly generated names, overwriting is intended).
/// The merged result is always written back inline on the page — note this
/// replaces a previously shared (referenced) resources dictionary with a
/// page-local copy.
fn merge_xfa_resources_into_page(
    doc: &mut Document,
    page_id: ObjectId,
    xfa_resources: &Dictionary,
    xfa_xobjects: &Dictionary,
) {
    // Snapshot the page's current resources, following one level of
    // indirection; absent/unresolvable resources start from empty.
    let existing_resources = doc
        .get_dictionary(page_id)
        .ok()
        .and_then(|page_dict| {
            page_dict.get(b"Resources").ok().and_then(|obj| match obj {
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                Object::Dictionary(d) => Some(d.clone()),
                _ => None,
            })
        })
        .unwrap_or_default();

    let mut merged = existing_resources;

    if let Ok(xfa_font_dict) = xfa_resources.get(b"Font").and_then(|o| o.as_dict()) {
        let existing_font = merged
            .get(b"Font")
            .ok()
            .and_then(|obj| match obj {
                Object::Dictionary(d) => Some(d.clone()),
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                _ => None,
            })
            .unwrap_or_default();

        let mut font_merged = existing_font;
        for (key, val) in xfa_font_dict.iter() {
            // Keep the page's own font under a clashing name.
            if font_merged.get(key).is_err() {
                font_merged.set(key.clone(), val.clone());
            }
        }
        merged.set("Font", Object::Dictionary(font_merged));
    }

    if !xfa_xobjects.is_empty() {
        let existing_xobj = merged
            .get(b"XObject")
            .ok()
            .and_then(|obj| match obj {
                Object::Dictionary(d) => Some(d.clone()),
                Object::Reference(id) => doc.get_dictionary(*id).ok().cloned(),
                _ => None,
            })
            .unwrap_or_default();

        let mut xobj_merged = existing_xobj;
        for (key, val) in xfa_xobjects.iter() {
            xobj_merged.set(key.clone(), val.clone());
        }
        merged.set("XObject", Object::Dictionary(xobj_merged));
    }

    if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
        page_dict.set("Resources", Object::Dictionary(merged));
    }
}
3520
/// Appends a brand-new page of size `w` × `h` (points) holding the rendered
/// overlay, wired into the document's page tree root.
///
/// Images are embedded as XObjects (failures logged and skipped). The new
/// page is pushed onto the root `/Kids` and `/Count` is bumped — both only
/// when those entries already have the expected types.
fn add_new_page(
    doc: &mut Document,
    w: f64,
    h: f64,
    overlay: &PageOverlay,
    font_ids: &[ObjectId; 3],
    embedded_fonts: &[(String, ObjectId)],
) -> Result<()> {
    let mut resources = make_resources_dict(font_ids, embedded_fonts);

    let mut xobjects = Dictionary::new();
    for img in &overlay.images {
        match embed_image(doc, &img.data, &img.mime_type) {
            Ok(result) => {
                xobjects.set(img.name.as_str(), Object::Reference(result.object_id));
            }
            Err(e) => {
                // Best-effort: a bad image should not abort page creation.
                eprintln!("failed to embed image {}: {}", img.name, e);
            }
        }
    }
    if !xobjects.is_empty() {
        resources.set("XObject", Object::Dictionary(xobjects));
    }

    let stream = Stream::new(
        dictionary! { "Length" => Object::Integer(overlay.content_stream.len() as i64) },
        overlay.content_stream.clone(),
    );
    let stream_id = doc.add_object(Object::Stream(stream));

    let pages_id = find_pages_root(doc)?;

    let page_id = doc.add_object(Object::Dictionary(dictionary! {
        "Type" => Object::Name(b"Page".to_vec()),
        "Parent" => Object::Reference(pages_id),
        "MediaBox" => Object::Array(vec![
            Object::Integer(0), Object::Integer(0),
            Object::Real(w as f32), Object::Real(h as f32),
        ]),
        "Contents" => Object::Reference(stream_id),
        "Resources" => Object::Dictionary(resources)
    }));

    // Register the page with the tree root; /Kids and /Count are only
    // touched when they already carry the expected object types.
    if let Ok(Object::Dictionary(ref mut pages_dict)) = doc.get_object_mut(pages_id) {
        if let Ok(Object::Array(ref mut kids)) = pages_dict.get_mut(b"Kids") {
            kids.push(Object::Reference(page_id));
        }
        if let Ok(Object::Integer(ref mut count)) = pages_dict.get_mut(b"Count") {
            *count += 1;
        }
    }
    Ok(())
}
3578
3579fn make_resources_dict(
3580 font_ids: &[ObjectId; 3],
3581 embedded_fonts: &[(String, ObjectId)],
3582) -> Dictionary {
3583 let mut fonts = Dictionary::new();
3584 fonts.set("F1", Object::Reference(font_ids[0]));
3585 fonts.set("F2", Object::Reference(font_ids[1]));
3586 fonts.set("F3", Object::Reference(font_ids[2]));
3587 for (name, obj_id) in embedded_fonts {
3588 fonts.set(name.as_str(), Object::Reference(*obj_id));
3589 }
3590 let mut resources = Dictionary::new();
3591 resources.set("Font", Object::Dictionary(fonts));
3592 resources
3593}
3594
3595fn find_pages_root(doc: &Document) -> Result<ObjectId> {
3596 let root_id = doc
3597 .trailer
3598 .get(b"Root")
3599 .ok()
3600 .and_then(|o: &Object| o.as_reference().ok())
3601 .ok_or_else(|| XfaError::LoadFailed("no /Root in trailer".to_string()))?;
3602 let catalog = doc
3603 .get_dictionary(root_id)
3604 .map_err(|e| XfaError::LoadFailed(format!("catalog: {e}")))?;
3605 catalog
3606 .get(b"Pages")
3607 .ok()
3608 .and_then(|o: &Object| o.as_reference().ok())
3609 .ok_or_else(|| XfaError::LoadFailed("no /Pages in catalog".to_string()))
3610}
3611
/// Removes the AcroForm (and its XFA payload) from the document, then strips
/// widget annotations from every page.
///
/// Order of operations matters: the catalog entries (`/AcroForm`,
/// `/NeedsRendering`) are dropped first while remembering the AcroForm's
/// object id; the XFA stream ids are then collected *before* the AcroForm
/// dictionary is deleted, so they can be removed from the object table too.
fn remove_acroform(doc: &mut Document) {
    let root_id = match doc.trailer.get(b"Root") {
        Ok(Object::Reference(id)) => *id,
        _ => return,
    };

    // Drop the catalog's form-related keys, keeping the AcroForm id (when it
    // was an indirect reference) for the cleanup below.
    let acroform_id: Option<ObjectId> = {
        if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
            let acroform_ref = dict.get(b"AcroForm").ok().and_then(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            });
            dict.remove(b"AcroForm");
            dict.remove(b"NeedsRendering");
            acroform_ref
        } else {
            None
        }
    };

    // Collect every stream referenced by /XFA (array-of-refs or single ref)
    // before the AcroForm dictionary itself is removed.
    let xfa_stream_ids: Vec<ObjectId> = acroform_id
        .and_then(|af_id| doc.get_dictionary(af_id).ok())
        .map(|af_dict| match af_dict.get(b"XFA") {
            Ok(Object::Array(arr)) => arr
                .iter()
                .filter_map(|o| {
                    if let Object::Reference(id) = o {
                        Some(*id)
                    } else {
                        None
                    }
                })
                .collect(),
            Ok(Object::Reference(id)) => vec![*id],
            _ => Vec::new(),
        })
        .unwrap_or_default();

    if let Some(af_id) = acroform_id {
        if let Ok(Object::Dictionary(ref mut af_dict)) = doc.get_object_mut(af_id) {
            af_dict.remove(b"XFA");
        }
    }

    // Physically delete the now-orphaned objects from the object table.
    for stream_id in xfa_stream_ids {
        doc.objects.remove(&stream_id);
    }
    if let Some(af_id) = acroform_id {
        doc.objects.remove(&af_id);
    }

    // Finally, remove the widget annotations the form left on each page.
    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
    for page_id in page_ids {
        strip_widget_annotations(doc, page_id);
    }
}
3693
/// Outcome of post-flatten sanity checks over a produced PDF
/// (see `validate_flattened_pdf`).
pub struct FlattenValidation {
    /// True when no `/XFA` entry was found (neither in the AcroForm
    /// dictionary nor directly in the catalog).
    pub has_no_xfa: bool,
    /// True when the catalog no longer carries `/NeedsRendering`.
    pub has_no_needs_rendering: bool,
    /// True when the catalog no longer carries `/AcroForm`.
    pub has_no_acroform: bool,
    /// Number of pages found in the validated document.
    pub page_count: usize,
    /// Human-readable notes for everything suspicious that was found
    /// (leftover form structures, surviving widget annotations, parse
    /// failures).
    pub warnings: Vec<String>,
}
3714
/// Sanity-checks a flattened PDF for leftover interactive-form structures.
///
/// Verifies that the catalog no longer references `/AcroForm`, `/XFA`, or
/// `/NeedsRendering`, and that no widget annotation survives on any page.
/// This function never fails: empty or unparseable input produces a result
/// with explanatory warnings rather than an error.
pub fn validate_flattened_pdf(pdf_bytes: &[u8]) -> Result<FlattenValidation> {
    if pdf_bytes.is_empty() {
        // Nothing to check — report "clean" with an explanatory warning.
        return Ok(FlattenValidation {
            has_no_xfa: true,
            has_no_needs_rendering: true,
            has_no_acroform: true,
            page_count: 0,
            warnings: vec!["empty input — no PDF to validate".into()],
        });
    }

    let doc = match Document::load_mem(pdf_bytes) {
        Ok(d) => d,
        Err(e) => {
            // Unparseable output: pessimistically report all checks failed.
            return Ok(FlattenValidation {
                has_no_xfa: false,
                has_no_needs_rendering: false,
                has_no_acroform: false,
                page_count: 0,
                warnings: vec![format!("could not parse PDF: {e}")],
            });
        }
    };

    let mut warnings = Vec::new();
    let mut has_no_xfa = true;
    let mut has_no_needs_rendering = true;
    let mut has_no_acroform = true;

    let root_id = doc.trailer.get(b"Root").ok().and_then(|o| {
        if let Object::Reference(id) = o {
            Some(*id)
        } else {
            None
        }
    });

    if let Some(rid) = root_id {
        if let Ok(catalog) = doc.get_dictionary(rid) {
            if catalog.get(b"AcroForm").is_ok() {
                has_no_acroform = false;
                warnings.push("/AcroForm still present in catalog".into());

                // An AcroForm that still carries /XFA is the worst case:
                // flattening left the dynamic form fully intact.
                let acroform_has_xfa = catalog
                    .get(b"AcroForm")
                    .ok()
                    .and_then(|o| match o {
                        Object::Reference(id) => doc.get_dictionary(*id).ok(),
                        Object::Dictionary(d) => Some(d),
                        _ => None,
                    })
                    .map(|d| d.get(b"XFA").is_ok())
                    .unwrap_or(false);

                if acroform_has_xfa {
                    has_no_xfa = false;
                    warnings.push("/XFA still present in AcroForm dictionary".into());
                }
            }

            if catalog.get(b"NeedsRendering").is_ok() {
                has_no_needs_rendering = false;
                warnings.push("/NeedsRendering still present in catalog".into());
            }

            // Non-standard but seen in the wild: /XFA directly on the catalog.
            if catalog.get(b"XFA").is_ok() {
                has_no_xfa = false;
                warnings.push("/XFA still present directly in catalog".into());
            }
        }
    }

    // Surviving widget annotations only generate warnings; they do not flip
    // any of the boolean flags.
    let page_ids: Vec<ObjectId> = doc.page_iter().collect();
    let page_count = page_ids.len();
    for page_id in page_ids {
        for annot_obj in page_annotations(&doc, page_id) {
            let is_widget = annot_obj
                .as_reference()
                .ok()
                .and_then(|id| doc.get_dictionary(id).ok())
                .and_then(|d| {
                    d.get(b"Subtype")
                        .ok()
                        .map(|st| st == &Object::Name(b"Widget".to_vec()))
                })
                .unwrap_or(false);
            if is_widget {
                warnings.push(format!(
                    "widget annotation found on page (object {:?})",
                    annot_obj
                ));
            }
        }
    }

    Ok(FlattenValidation {
        has_no_xfa,
        has_no_needs_rendering,
        has_no_acroform,
        page_count,
        warnings,
    })
}
3829
/// Coarse before/after metrics comparing an original PDF against its
/// flattened output (see `compare_flatten_quality`).
pub struct FlattenQualityMetrics {
    /// Page count parsed from the original bytes (0 when unparseable).
    pub page_count_before: usize,
    /// Page count parsed from the flattened bytes (0 when unparseable).
    pub page_count_after: usize,
    /// True when the two page counts are equal.
    pub page_count_match: bool,
    /// Total bytes across all stream objects in the original document.
    pub content_stream_bytes_before: usize,
    /// Total bytes across all stream objects in the flattened document.
    pub content_stream_bytes_after: usize,
    /// after/before ratio of stream bytes; 1.0 when "before" is zero.
    pub content_ratio: f64,
}
3852
3853pub fn compare_flatten_quality(
3858 original_bytes: &[u8],
3859 flattened_bytes: &[u8],
3860) -> Result<FlattenQualityMetrics> {
3861 fn count_pages_and_stream_bytes(pdf_bytes: &[u8]) -> (usize, usize) {
3862 let doc = match Document::load_mem(pdf_bytes) {
3863 Ok(d) => d,
3864 Err(_) => return (0, 0),
3865 };
3866 let page_count = doc.page_iter().count();
3867 let stream_bytes: usize = doc
3868 .objects
3869 .values()
3870 .filter_map(|obj| {
3871 if let Object::Stream(s) = obj {
3872 s.content.len().into()
3874 } else {
3875 None
3876 }
3877 })
3878 .sum();
3879 (page_count, stream_bytes)
3880 }
3881
3882 let (page_count_before, content_stream_bytes_before) =
3883 count_pages_and_stream_bytes(original_bytes);
3884 let (page_count_after, content_stream_bytes_after) =
3885 count_pages_and_stream_bytes(flattened_bytes);
3886
3887 let content_ratio = if content_stream_bytes_before == 0 {
3888 1.0_f64
3889 } else {
3890 content_stream_bytes_after as f64 / content_stream_bytes_before as f64
3891 };
3892
3893 Ok(FlattenQualityMetrics {
3894 page_count_before,
3895 page_count_after,
3896 page_count_match: page_count_before == page_count_after,
3897 content_stream_bytes_before,
3898 content_stream_bytes_after,
3899 content_ratio,
3900 })
3901}
3902
/// Result of checking that the XFA datasets' field values survived into the
/// flattened PDF's visible text (see `validate_text_completeness`).
pub struct TextValidation {
    /// Every non-empty field value found in the original datasets packet.
    pub expected_values: Vec<String>,
    /// The subset of expected values located in the flattened PDF's text.
    pub found_values: Vec<String>,
    /// The subset of expected values NOT located in the flattened PDF.
    pub missing_values: Vec<String>,
    /// found / expected; 1.0 when there was nothing to validate.
    pub completeness_ratio: f64,
}
3923
3924fn extract_field_values_from_datasets(datasets_xml: &str) -> Vec<String> {
3927 let mut values = Vec::new();
3931 let mut remaining = datasets_xml;
3932
3933 while let Some(open_pos) = remaining.find("<field") {
3934 let tag_end = match remaining[open_pos..].find('>') {
3936 Some(p) => open_pos + p + 1,
3937 None => break,
3938 };
3939
3940 if remaining[open_pos..tag_end].ends_with("/>") {
3942 remaining = &remaining[tag_end..];
3943 continue;
3944 }
3945
3946 let close_tag = "</field>";
3948 match remaining[tag_end..].find(close_tag) {
3949 Some(close_pos) => {
3950 let inner = &remaining[tag_end..tag_end + close_pos];
3951 let text = extract_innermost_text(inner);
3954 if !text.is_empty() {
3955 values.push(text);
3956 }
3957 remaining = &remaining[tag_end + close_pos + close_tag.len()..];
3958 }
3959 None => break,
3960 }
3961 }
3962 values
3963}
3964
3965fn extract_innermost_text(inner: &str) -> String {
3968 if let Some(start) = inner.find("<text>") {
3970 let content_start = start + "<text>".len();
3971 if let Some(end) = inner[content_start..].find("</text>") {
3972 let s = inner[content_start..content_start + end].trim().to_string();
3973 if !s.is_empty() {
3974 return s;
3975 }
3976 }
3977 }
3978 let stripped = strip_xml_tags(inner);
3980 stripped.trim().to_string()
3981}
3982
/// Removes every `<...>` tag from `s`, keeping only the character data.
/// Tags never nest in well-formed XML, so a simple on/off flag suffices;
/// the `<` and `>` delimiters themselves are never emitted.
fn strip_xml_tags(s: &str) -> String {
    let mut inside_tag = false;
    s.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
3997
3998fn extract_text_from_pdf_bytes(pdf_bytes: &[u8]) -> String {
4005 let doc = match Document::load_mem(pdf_bytes) {
4006 Ok(d) => d,
4007 Err(_) => return String::new(),
4008 };
4009
4010 let mut text = String::new();
4011
4012 for obj in doc.objects.values() {
4013 if let Object::Stream(ref stream) = obj {
4014 let content = match stream.decompressed_content() {
4016 Ok(c) => c,
4017 Err(_) => stream.content.clone(),
4018 };
4019 let fragment = extract_text_from_content_stream(&content);
4020 if !fragment.is_empty() {
4021 text.push(' ');
4022 text.push_str(&fragment);
4023 }
4024 }
4025 }
4026 text
4027}
4028
/// Scans a content stream for PDF literal strings `( ... )` and returns the
/// printable-ASCII ones, space-separated.
///
/// Handles nested parentheses (depth counter) and `\`-escapes (the escaped
/// byte is skipped). Because the outer loop revisits every character, an
/// inner `(` inside an already-extracted literal is extracted again on its
/// own — harmless duplication for this validation use. Literals containing
/// non-ASCII or non-printable bytes are dropped entirely.
fn extract_text_from_content_stream(content: &[u8]) -> String {
    let s = String::from_utf8_lossy(content);
    let mut result = String::new();

    for (i, ch) in s.char_indices() {
        if ch == '(' {
            let start = i + 1;
            // Walk forward to the balancing `)`, honoring escapes.
            let mut depth: i32 = 1;
            let mut end = start;
            let bytes = s.as_bytes();
            while end < bytes.len() && depth > 0 {
                match bytes[end] {
                    b'(' => depth += 1,
                    b')' => depth -= 1,
                    b'\\' => {
                        // Skip the escaped byte so `\)` doesn't close the literal.
                        end += 1; }
                    _ => {}
                }
                end += 1;
            }
            // depth == 0 means a balancing `)` was found; `end - 1` is its
            // index, which is ASCII, so the slice boundary is valid UTF-8.
            if depth == 0 {
                let literal = &s[start..end - 1];
                if literal.chars().all(|c| {
                    c.is_ascii()
                        && (c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation())
                }) {
                    let trimmed = literal.trim();
                    if !trimmed.is_empty() {
                        result.push(' ');
                        result.push_str(trimmed);
                    }
                }
            }
        }
    }
    result
}
4073
4074pub fn validate_text_completeness(
4087 original_xfa_bytes: &[u8],
4088 flattened_bytes: &[u8],
4089) -> crate::error::Result<TextValidation> {
4090 let packets = match crate::extract::extract_xfa_from_bytes(original_xfa_bytes.to_vec()) {
4092 Ok(p) => p,
4093 Err(_) => {
4094 return Ok(TextValidation {
4096 expected_values: vec![],
4097 found_values: vec![],
4098 missing_values: vec![],
4099 completeness_ratio: 1.0,
4100 });
4101 }
4102 };
4103
4104 let datasets_xml = match packets.datasets() {
4105 Some(ds) => ds.to_string(),
4106 None => {
4107 return Ok(TextValidation {
4108 expected_values: vec![],
4109 found_values: vec![],
4110 missing_values: vec![],
4111 completeness_ratio: 1.0,
4112 });
4113 }
4114 };
4115
4116 let expected_values = extract_field_values_from_datasets(&datasets_xml);
4118
4119 if expected_values.is_empty() {
4120 return Ok(TextValidation {
4121 expected_values: vec![],
4122 found_values: vec![],
4123 missing_values: vec![],
4124 completeness_ratio: 1.0,
4125 });
4126 }
4127
4128 let output_text = extract_text_from_pdf_bytes(flattened_bytes);
4130
4131 let mut found_values = Vec::new();
4133 let mut missing_values = Vec::new();
4134
4135 for value in &expected_values {
4136 if output_text.contains(value.as_str()) {
4137 found_values.push(value.clone());
4138 } else {
4139 missing_values.push(value.clone());
4140 }
4141 }
4142
4143 let completeness_ratio = if expected_values.is_empty() {
4144 1.0
4145 } else {
4146 found_values.len() as f64 / expected_values.len() as f64
4147 };
4148
4149 Ok(TextValidation {
4150 expected_values,
4151 found_values,
4152 missing_values,
4153 completeness_ratio,
4154 })
4155}
4156
/// Test-only wrapper that invokes `flatten_xfa_to_pdf` as though it were
/// re-entered from an outer flatten call, by pre-setting the thread-local
/// recursion depth before the call and clearing it afterwards.
#[cfg(test)]
fn flatten_xfa_to_pdf_simulate_reentrant(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
    // Pretend an outer flatten is already on this thread's stack.
    FLATTEN_DEPTH.with(|depth| depth.set(1));
    let outcome = flatten_xfa_to_pdf(pdf_bytes);
    // Restore regardless of whether the inner call succeeded.
    FLATTEN_DEPTH.with(|depth| depth.set(0));
    outcome
}
4176
4177#[cfg(test)]
4178mod tests {
4179 use super::*;
4180
    /// Builds a minimal single-page XFA PDF fixture: the given XDP packet is
    /// stored as a single /XFA stream on the AcroForm, and `page_content`
    /// becomes the page's content stream. Returns the serialized PDF bytes.
    fn build_xfa_pdf_with_content(xdp: &str, page_content: Vec<u8>) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};
        let mut doc = Document::with_version("1.4");
        // XDP packet stored as one monolithic XFA stream.
        let xdp_bytes = xdp.as_bytes().to_vec();
        let xfa_stream = Stream::new(
            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
            xdp_bytes,
        );
        let xfa_id = doc.add_object(Object::Stream(xfa_stream));
        // Reserve the Pages id up front so the page can reference its parent
        // before the Pages dictionary itself is inserted.
        let pages_id = doc.new_object_id();
        let content_stream = Stream::new(
            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
            page_content,
        );
        let content_id = doc.add_object(Object::Stream(content_stream));
        // US-Letter page (612x792 pt).
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
            "Contents" => Object::Reference(content_id)
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );
        // AcroForm with the XFA entry but no widget fields.
        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
            "XFA" => Object::Reference(xfa_id),
            "Fields" => Object::Array(vec![])
        }));
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
            "AcroForm" => Object::Reference(acroform_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        let mut out = Vec::new();
        doc.save_to(&mut out).unwrap();
        out
    }
4228
4229 fn build_xfa_pdf(xdp: &str) -> Vec<u8> {
4230 build_xfa_pdf_with_content(xdp, Vec::new())
4231 }
4232
    /// Builds an XFA PDF whose AcroForm /XFA entry is the packet-array form
    /// (alternating packet-name / stream-reference pairs) rather than a single
    /// stream. Returns the document, the AcroForm id, and the packet stream
    /// ids in insertion order (xdp:xdp, template, datasets).
    fn build_xfa_doc_with_xfa_array() -> (Document, ObjectId, Vec<ObjectId>) {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        // Reserve the Pages id so the page can point at its parent first.
        let pages_id = doc.new_object_id();
        let content_id = doc.add_object(Object::Stream(Stream::new(
            dictionary! { "Length" => Object::Integer(0) },
            Vec::new(),
        )));
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
            "Contents" => Object::Reference(content_id)
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );

        // Minimal-but-valid payloads for the three packets a consumer is
        // likely to look for.
        let packet_payloads = [
            (
                b"xdp:xdp".to_vec(),
                br#"<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"></xdp:xdp>"#.to_vec(),
            ),
            (
                b"template".to_vec(),
                br#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform/></template>"#
                    .to_vec(),
            ),
            (
                b"datasets".to_vec(),
                br#"<xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/"></xfa:datasets>"#
                    .to_vec(),
            ),
        ];

        // The /XFA array interleaves Name and Reference entries:
        // [ (name) ref (name) ref ... ]
        let mut xfa_array = Vec::new();
        let mut xfa_ids = Vec::new();
        for (packet_name, payload) in packet_payloads {
            let stream_id = doc.add_object(Object::Stream(Stream::new(
                dictionary! { "Length" => Object::Integer(payload.len() as i64) },
                payload,
            )));
            xfa_array.push(Object::Name(packet_name));
            xfa_array.push(Object::Reference(stream_id));
            xfa_ids.push(stream_id);
        }

        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
            "XFA" => Object::Array(xfa_array),
            "Fields" => Object::Array(vec![])
        }));
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
            "AcroForm" => Object::Reference(acroform_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        (doc, acroform_id, xfa_ids)
    }
4301
    /// Builds a hybrid XFA PDF fixture: a single page with `page_content`,
    /// one Widget annotation whose /AP /N is `normal_appearance` (either a
    /// dictionary of states or a stream; a Reference is used as-is), plus any
    /// extra widget keys from `widget_extra`. The XFA packet is `SIMPLE_XDP`.
    fn build_xfa_pdf_with_widget_appearance(
        page_content: Vec<u8>,
        normal_appearance: Object,
        widget_extra: Dictionary,
    ) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let xdp_bytes = SIMPLE_XDP.as_bytes().to_vec();
        let xfa_stream = Stream::new(
            dictionary! { "Length" => Object::Integer(xdp_bytes.len() as i64) },
            xdp_bytes,
        );
        let xfa_id = doc.add_object(Object::Stream(xfa_stream));

        let pages_id = doc.new_object_id();
        let content_id = doc.add_object(Object::Stream(Stream::new(
            dictionary! { "Length" => Object::Integer(page_content.len() as i64) },
            page_content,
        )));

        // Reuse a caller-supplied indirect reference; otherwise register the
        // appearance object (stream or state dictionary) ourselves.
        let appearance_id = match normal_appearance {
            Object::Reference(id) => id,
            other => doc.add_object(other),
        };

        // Reserve the widget id so the page's /Annots can reference it
        // before the widget dictionary is inserted.
        let widget_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
            "Contents" => Object::Reference(content_id),
            "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
            "Resources" => Object::Dictionary(dictionary! {})
        }));

        let mut widget = dictionary! {
            "Type" => Object::Name(b"Annot".to_vec()),
            "Subtype" => Object::Name(b"Widget".to_vec()),
            "Rect" => Object::Array(vec![
                Object::Integer(100), Object::Integer(700),
                Object::Integer(220), Object::Integer(730),
            ]),
            "AP" => Object::Dictionary(dictionary! {
                "N" => Object::Reference(appearance_id)
            }),
            "P" => Object::Reference(page_id)
        };
        // Caller-supplied keys (e.g. /FT, /AS, /T) override/extend defaults.
        for (key, value) in widget_extra {
            widget.set(key, value);
        }
        doc.objects.insert(widget_id, Object::Dictionary(widget));

        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );

        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
            "XFA" => Object::Reference(xfa_id),
            "Fields" => Object::Array(vec![Object::Reference(widget_id)])
        }));
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
            "AcroForm" => Object::Reference(acroform_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut out = Vec::new();
        doc.save_to(&mut out).unwrap();
        out
    }
4382
4383 #[allow(dead_code)]
4384 fn find_last_content_stream<'a>(doc: &'a Document, page_id: ObjectId) -> &'a Stream {
4385 let page_dict = doc.get_dictionary(page_id).expect("page dict");
4386 match page_dict.get(b"Contents").expect("contents") {
4387 Object::Reference(id) => doc
4388 .get_object(*id)
4389 .expect("contents object")
4390 .as_stream()
4391 .expect("contents stream"),
4392 Object::Array(arr) => {
4393 let last = arr.last().expect("last content stream");
4394 let id = last.as_reference().expect("contents ref");
4395 doc.get_object(id)
4396 .expect("contents object")
4397 .as_stream()
4398 .expect("contents stream")
4399 }
4400 other => other.as_stream().expect("contents stream"),
4401 }
4402 }
4403
4404 #[allow(dead_code)]
4405 fn page_xobjects(doc: &Document, page_id: ObjectId) -> Dictionary {
4406 let page_dict = doc.get_dictionary(page_id).expect("page dict");
4407 let resources = page_dict
4408 .get(b"Resources")
4409 .expect("resources")
4410 .as_dict()
4411 .expect("resources dict");
4412 resources
4413 .get(b"XObject")
4414 .expect("xobjects")
4415 .as_dict()
4416 .expect("xobject dict")
4417 .clone()
4418 }
4419
    /// When /Contents is an *indirect* array of stream references, appending
    /// new content must produce a flat array of stream references — no nested
    /// arrays, which many viewers reject.
    #[test]
    fn append_to_page_content_flattens_indirect_contents_arrays() {
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let first_stream_id = doc.add_object(Stream::new(dictionary! {}, b"q\n".to_vec()));
        let second_stream_id = doc.add_object(Stream::new(dictionary! {}, b"Q\n".to_vec()));
        // The Contents array itself is an indirect object, not inline.
        let contents_array_id = doc.add_object(Object::Array(vec![
            Object::Reference(first_stream_id),
            Object::Reference(second_stream_id),
        ]));
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
            "Contents" => Object::Reference(contents_array_id),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );

        append_to_page_content(&mut doc, page_id, b"BT\nET\n");

        let page_dict = doc.get_dictionary(page_id).expect("page dict");
        let contents = page_dict.get(b"Contents").expect("contents");
        let items = contents.as_array().expect("flattened contents array");

        assert_eq!(items.len(), 3, "existing streams + appended stream");
        assert!(
            items.iter().all(|obj| obj.as_reference().is_ok()),
            "contents array must stay flat and reference only streams"
        );
        // Every entry must resolve directly to a stream, proving no nested
        // array survived the append.
        for object in items {
            let stream_id = object.as_reference().expect("stream ref");
            assert!(
                doc.get_object(stream_id)
                    .expect("stream object")
                    .as_stream()
                    .is_ok(),
                "nested arrays must not survive in page contents"
            );
        }
    }
4470
    /// Minimal paginating XDP: one page area, one text field
    /// ("firstName" with value "John").
    const SIMPLE_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
4491
    /// Like `SIMPLE_XDP` but with an initialize-event JavaScript on the
    /// section subform, used to exercise the script-skipping path.
    const JS_EVENT_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate">
    <pageSet>
      <pageArea name="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="section" layout="tb" w="7.5in">
      <event activity="initialize">
        <script contentType="application/x-javascript">app.alert('blocked');</script>
      </event>
      <field name="firstName" w="3.5in" h="0.3in">
        <caption><value><text>First Name</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;
4515
4516 fn overflowing_paginate_xdp(base_profile: Option<&str>) -> String {
4517 let mut fields = String::new();
4518 for i in 0..40 {
4519 fields.push_str(&format!(
4520 r#"
4521 <field name="line{i}" w="7.0in" h="0.3in">
4522 <ui><textEdit/></ui>
4523 <value><text>Line {i}</text></value>
4524 </field>"#
4525 ));
4526 }
4527
4528 let base_profile_attr = base_profile
4529 .map(|value| format!(r#" baseProfile="{value}""#))
4530 .unwrap_or_default();
4531
4532 format!(
4533 r#"<?xml version="1.0" encoding="UTF-8"?>
4534<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
4535<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"{base_profile_attr}>
4536 <subform name="form1" layout="paginate">
4537 <pageSet>
4538 <pageArea name="Page1">
4539 <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
4540 <medium stock="default" short="8.5in" long="11in"/>
4541 </pageArea>
4542 </pageSet>
4543 <subform name="section" layout="tb" w="7.5in">{fields}
4544 </subform>
4545 </subform>
4546</template>
4547</xdp:xdp>"#
4548 )
4549 }
4550
4551 #[test]
4552 fn flatten_simple_form_produces_non_empty_content() {
4553 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
4554 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4555
4556 let doc = Document::load_mem(&result).expect("load flattened PDF");
4558 let pages: Vec<ObjectId> = doc.page_iter().collect();
4559 assert!(!pages.is_empty(), "flattened PDF has no pages");
4560
4561 let mut found_content = false;
4563 for page_id in &pages {
4564 if let Ok(page_dict) = doc.get_dictionary(*page_id) {
4565 if let Ok(contents_ref) = page_dict.get(b"Contents") {
4566 if let Object::Reference(stream_id) = contents_ref {
4567 if let Ok(obj) = doc.get_object(*stream_id) {
4568 if let Ok(stream) = obj.as_stream() {
4569 if !stream.content.is_empty() {
4570 found_content = true;
4571 }
4572 }
4573 }
4574 }
4575 }
4576 }
4577 }
4578 assert!(found_content, "all content streams are empty after flatten");
4579 }
4580
4581 #[test]
4582 fn flatten_reports_best_effort_for_xfa_javascript_event() {
4583 let pdf_bytes = build_xfa_pdf(JS_EVENT_XDP);
4584
4585 let (flattened, metadata) =
4586 flatten_xfa_to_pdf_with_metadata(&pdf_bytes).expect("flatten should skip JS");
4587
4588 assert!(!flattened.is_empty());
4589 assert_eq!(metadata.output_quality, OutputQuality::BestEffort);
4590 assert!(metadata.dynamic_scripts.js_present);
4591 assert_eq!(metadata.dynamic_scripts.js_skipped, 1);
4592 }
4593
    /// A JavaScript /OpenAction planted on the catalog must not survive
    /// flattening.
    #[test]
    fn flatten_strips_catalog_open_action_javascript() {
        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
        // Re-open the fixture, inject the OpenAction, and re-serialize.
        {
            let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
            let root_id = match doc.trailer.get(b"Root") {
                Ok(Object::Reference(id)) => *id,
                _ => panic!("no Root in test PDF"),
            };
            if let Ok(Object::Dictionary(catalog)) = doc.get_object_mut(root_id) {
                catalog.set(
                    "OpenAction",
                    Object::Dictionary(dictionary! {
                        "S" => Object::Name(b"JavaScript".to_vec()),
                        "JS" => Object::String(
                            b"app.alert('blocked')".to_vec(),
                            lopdf::StringFormat::Literal,
                        ),
                    }),
                );
            }
            let mut out = Vec::new();
            doc.save_to(&mut out).expect("save test PDF");
            pdf_bytes = out;
        }

        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
        let doc = Document::load_mem(&flattened).expect("load flattened PDF");
        let root_id = match doc.trailer.get(b"Root") {
            Ok(Object::Reference(id)) => *id,
            _ => panic!("no Root in flattened PDF"),
        };
        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
        // get() erring means the key is absent — the action was stripped.
        assert!(
            catalog.get(b"OpenAction").is_err(),
            "/OpenAction JavaScript must be stripped from flattened output"
        );
    }
4632
    /// A paginate subform containing an lr-tb row of two fields should
    /// flatten to a single page whose content stream contains real text
    /// operators (BT/Tj) for the field values.
    #[test]
    fn flatten_paginate_subform_with_nested_pageset_produces_visible_content() {
        const LR_TB_XDP: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
  <subform name="form1" layout="paginate" locale="en_US">
    <pageSet>
      <pageArea name="Page1" id="Page1">
        <contentArea x="0.5in" y="0.5in" w="7.5in" h="10in"/>
        <medium stock="default" short="8.5in" long="11in"/>
      </pageArea>
    </pageSet>
    <subform name="row1" layout="lr-tb" w="7.5in" h="0.4in">
      <field name="firstName" w="3.5in" h="0.4in">
        <caption><value><text>First</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>John</text></value>
      </field>
      <field name="lastName" w="3.5in" h="0.4in">
        <caption><value><text>Last</text></value></caption>
        <ui><textEdit/></ui>
        <value><text>Doe</text></value>
      </field>
    </subform>
  </subform>
</template>
</xdp:xdp>"#;

        let pdf_bytes = build_xfa_pdf(LR_TB_XDP);
        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");

        let doc = Document::load_mem(&result).expect("load flattened PDF");
        let pages: Vec<ObjectId> = doc.page_iter().collect();

        assert_eq!(pages.len(), 1, "expected 1 page, got {}", pages.len());

        // Best-effort inspection: only asserts when the contents resolve to
        // a directly referenced stream.
        if let Ok(page_dict) = doc.get_dictionary(pages[0]) {
            if let Ok(lopdf::Object::Reference(stream_id)) = page_dict.get(b"Contents") {
                if let Ok(obj) = doc.get_object(*stream_id) {
                    if let Ok(stream) = obj.as_stream() {
                        let content = String::from_utf8_lossy(&stream.content);
                        assert!(
                            content.contains("BT\n"),
                            "no text operators in page 1 content stream (should have BT from field values)"
                        );
                        assert!(
                            content.contains("Tj\n"),
                            "no text show operators in page 1 content stream"
                        );
                    }
                }
            }
        }
    }
4695
4696 #[test]
4697 fn static_single_page_pdf_does_not_append_xfa_overflow_pages() {
4698 let xdp = overflowing_paginate_xdp(Some("interactiveForms"));
4699 let pdf_bytes = build_xfa_pdf(&xdp);
4700 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4701
4702 let doc = Document::load_mem(&result).expect("load flattened PDF");
4703 let pages: Vec<ObjectId> = doc.page_iter().collect();
4704
4705 assert_eq!(
4706 pages.len(),
4707 1,
4708 "static 1-page PDFs should preserve the original page when XFA layout over-paginates"
4709 );
4710 }
4711
4712 #[test]
4713 fn dynamic_single_page_pdf_can_expand_beyond_original_page_count() {
4714 let xdp = overflowing_paginate_xdp(None);
4719 let pdf_bytes = build_xfa_pdf(&xdp);
4720 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4721
4722 let doc = Document::load_mem(&result).expect("load flattened PDF");
4723 let pages: Vec<ObjectId> = doc.page_iter().collect();
4724
4725 assert_eq!(
4726 pages.len(),
4727 2,
4728 "dynamic 1-page PDFs should be allowed to grow when XFA layout paginates"
4729 );
4730 }
4731
4732 #[test]
4733 fn flatten_removes_acroform() {
4734 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
4735 let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
4736 let doc = Document::load_mem(&result).expect("load flattened PDF");
4737 let root_id = doc.trailer.get(b"Root").unwrap().as_reference().unwrap();
4738 let catalog = doc.get_dictionary(root_id).unwrap();
4739 assert!(
4740 catalog.get(b"AcroForm").is_err(),
4741 "/AcroForm still present after flatten"
4742 );
4743 }
4744
    /// Flattening a plain PDF with no XFA (and no AcroForm) must succeed and
    /// return non-empty output rather than erroring.
    #[test]
    fn flatten_non_xfa_pdf_unchanged() {
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ])
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );
        // Catalog deliberately has no /AcroForm entry.
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        let mut raw = Vec::new();
        doc.save_to(&mut raw).unwrap();

        let result = flatten_xfa_to_pdf(&raw).expect("flatten non-XFA failed");
        assert!(!result.is_empty());
    }
4778
    /// A page containing only the Adobe "Please wait..." placeholder text
    /// must be replaced by real XFA-rendered content: the placeholder text
    /// should not count as static page content worth preserving.
    #[test]
    fn placeholder_only_page_does_not_trigger_static_strip_path() {
        // The standard placeholder stream Designer puts on XFA PDFs for
        // viewers that cannot render XFA.
        const PLACEHOLDER_STREAM: &str = r#"BT
/Helv 24 Tf
72 720 Td
(Please wait...) Tj
0 -32 Td
(If this message is not eventually replaced by the proper contents of the document,) Tj
0 -32 Td
(your PDF viewer may not be able to display this type of document.) Tj
0 -32 Td
(You can upgrade to the latest version of Adobe Reader by visiting reader_download.) Tj
ET
"#;

        let pdf_bytes =
            build_xfa_pdf_with_content(SIMPLE_XDP, PLACEHOLDER_STREAM.as_bytes().to_vec());
        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");

        let doc = Document::load_mem(&result).expect("load flattened PDF");
        let page_id = doc.page_iter().next().expect("flattened page");
        let page_dict = doc.get_dictionary(page_id).expect("page dict");
        let contents_id = page_dict
            .get(b"Contents")
            .ok()
            .and_then(|object| object.as_reference().ok())
            .expect("contents ref");
        let stream = doc
            .get_object(contents_id)
            .expect("contents object")
            .as_stream()
            .expect("contents stream");
        let content = String::from_utf8_lossy(&stream.content);

        assert!(
            content.contains("John"),
            "flattened page should contain XFA-rendered field content"
        );
        assert!(
            !content.contains("Please wait"),
            "placeholder text should not survive XFA flattening"
        );
    }
4822
    /// A hybrid PDF (XFA plus a widget with its own appearance over static
    /// page text) should come out of flattening with no annotations left on
    /// the page: the XFA layout wins over the static AcroForm widgets.
    #[test]
    fn hybrid_static_pdf_uses_xfa_layout_over_static_content() {
        // A simple bordered-box form XObject as the widget's /AP /N.
        let appearance = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(120), Object::Integer(30),
                ]),
                "Matrix" => Object::Array(vec![
                    Object::Integer(1), Object::Integer(0),
                    Object::Integer(0), Object::Integer(1),
                    Object::Integer(0), Object::Integer(0),
                ]),
                "Resources" => Object::Dictionary(dictionary! {}),
            },
            b"0 G\n0.5 0.5 119 29 re\ns\n".to_vec(),
        ));
        let page_content = b"BT /F1 12 Tf 72 720 Td (Line 1) Tj 0 -14 Td (Line 2) Tj 0 -14 Td (Line 3) Tj 0 -14 Td (Line 4) Tj 0 -14 Td (Line 5) Tj ET\n".to_vec();
        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
            page_content,
            appearance,
            dictionary! {
                "FT" => Object::Name(b"Tx".to_vec()),
                "T" => Object::string_literal("field[0]"),
            },
        );

        let result = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
        let doc = Document::load_mem(&result).expect("load flattened PDF");
        let page_id = doc.page_iter().next().expect("page");
        let page_dict = doc.get_dictionary(page_id).expect("page dict");

        // get() erring means /Annots is absent from the page.
        assert!(
            page_dict.get(b"Annots").is_err(),
            "XFA-flattened page should have no annotations"
        );
    }
4867
    /// When /AP /N is a state dictionary and /AS selects a state, appearance
    /// resolution must pick the stream of the selected state ("Yes" here).
    #[test]
    fn hybrid_static_pdf_uses_selected_button_appearance_state() {
        let yes_stream = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(20), Object::Integer(20),
                ]),
                "Matrix" => Object::Array(vec![
                    Object::Integer(1), Object::Integer(0),
                    Object::Integer(0), Object::Integer(1),
                    Object::Integer(0), Object::Integer(0),
                ]),
                "Resources" => Object::Dictionary(dictionary! {}),
            },
            b"BT /F1 8 Tf 1 1 Td (YES) Tj ET\n".to_vec(),
        ));
        let off_stream = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(20), Object::Integer(20),
                ]),
                "Matrix" => Object::Array(vec![
                    Object::Integer(1), Object::Integer(0),
                    Object::Integer(0), Object::Integer(1),
                    Object::Integer(0), Object::Integer(0),
                ]),
                "Resources" => Object::Dictionary(dictionary! {}),
            },
            b"BT /F1 8 Tf 1 1 Td (OFF) Tj ET\n".to_vec(),
        ));

        let mut doc = Document::with_version("1.4");
        // /AP /N points at a dictionary of per-state appearance streams.
        let state_id = doc.add_object(Object::Dictionary(dictionary! {
            "Yes" => yes_stream,
            "Off" => off_stream,
        }));
        let annot = dictionary! {
            "Subtype" => Object::Name(b"Widget".to_vec()),
            "Rect" => Object::Array(vec![
                Object::Integer(100), Object::Integer(700),
                Object::Integer(120), Object::Integer(720),
            ]),
            "AP" => Object::Dictionary(dictionary! {
                "N" => Object::Reference(state_id),
            }),
            "AS" => Object::Name(b"Yes".to_vec()),
            "FT" => Object::Name(b"Btn".to_vec()),
        };
        let ap_id =
            resolve_widget_normal_appearance(&mut doc, &annot).expect("selected normal appearance");
        let stream = doc
            .get_object(ap_id)
            .expect("appearance stream")
            .as_stream()
            .expect("appearance stream");
        let content = String::from_utf8_lossy(&stream.content);

        assert!(
            content.contains("YES"),
            "flatten should choose the selected normal appearance state"
        );
    }
4936
    /// A widget whose /AS is "Off" but whose state dictionary has no "Off"
    /// entry must resolve to no appearance at all — falling back to the
    /// on-state stream would render an unchecked box as checked.
    #[test]
    fn widget_as_off_without_off_appearance_returns_none() {
        let yes_stream = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(10), Object::Integer(10),
                ]),
            },
            b"q 5 5 m 5 5 l S Q\n".to_vec(),
        ));

        let mut doc = Document::with_version("1.4");
        // State dictionary holds only the on-state (export value "0").
        let state_id = doc.add_object(Object::Dictionary(dictionary! {
            "0" => yes_stream,
        }));
        let annot = dictionary! {
            "Subtype" => Object::Name(b"Widget".to_vec()),
            "Rect" => Object::Array(vec![
                Object::Integer(100), Object::Integer(700),
                Object::Integer(110), Object::Integer(710),
            ]),
            "AP" => Object::Dictionary(dictionary! {
                "N" => Object::Reference(state_id),
            }),
            "AS" => Object::Name(b"Off".to_vec()),
            "FT" => Object::Name(b"Btn".to_vec()),
        };
        assert!(
            resolve_widget_normal_appearance(&mut doc, &annot).is_none(),
            "Off state with no Off appearance should not resolve to the on-state stream"
        );
    }
4976
    /// Baking checkbox/radio appearance marks must skip a widget that is in
    /// the "Off" state when its state dictionary only contains the on-state
    /// stream — otherwise an unchecked box would get the check mark stamped.
    #[test]
    fn bake_checkbox_radio_ap_marks_skips_off_widgets_without_off_normal_appearance() {
        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
            Vec::new(),
            // State dictionary with only the on-state ("1"): an X-mark form.
            Object::Dictionary(dictionary! {
                "1" => Object::Stream(Stream::new(
                    dictionary! {
                        "Type" => Object::Name(b"XObject".to_vec()),
                        "Subtype" => Object::Name(b"Form".to_vec()),
                        "BBox" => Object::Array(vec![
                            Object::Integer(0), Object::Integer(0),
                            Object::Integer(10), Object::Integer(10),
                        ]),
                        "Resources" => Object::Dictionary(dictionary! {}),
                    },
                    b"q 1 1 8 8 re W n 2 8 m 8 2 l 8 8 m 2 2 l s Q\n".to_vec(),
                )),
            }),
            dictionary! {
                "FT" => Object::Name(b"Btn".to_vec()),
                "AS" => Object::Name(b"Off".to_vec()),
                "T" => Object::string_literal("checkbox[0]"),
            },
        );

        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
        let page_id = doc.page_iter().next().expect("page");
        let baked = bake_checkbox_radio_ap_marks(&mut doc, page_id);

        assert_eq!(baked, 0, "Off-state widget must not stamp the on-mark");
    }
5008
    /// Adding a flattened widget XObject to a page whose /Resources /XObject
    /// is an *indirect* dictionary must merge into that dictionary: the
    /// pre-existing entry stays and the new entry is added alongside it.
    #[test]
    fn adding_widget_xobject_preserves_indirect_inline_page_xobjects() {
        let mut doc = Document::with_version("1.4");
        let existing_xobject_id = doc.add_object(Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(10), Object::Integer(10),
                ]),
            },
            b"q Q\n".to_vec(),
        )));
        // The XObject dictionary itself is an indirect object, referenced
        // from the page's Resources.
        let xobject_dict_id = doc.add_object(Object::Dictionary(dictionary! {
            "R11" => Object::Reference(existing_xobject_id),
        }));

        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
            "Resources" => Object::Dictionary(dictionary! {
                "XObject" => Object::Reference(xobject_dict_id),
            }),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );

        let new_xobject_id = doc.add_object(Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(10), Object::Integer(10),
                ]),
            },
            b"0 0 10 10 re S\n".to_vec(),
        )));

        add_xobject_to_page_resources(&mut doc, page_id, "XfaAp0", new_xobject_id);

        // Inspect the indirect dictionary itself: both the old and new
        // entries must be present.
        let xobjects = doc
            .get_object(xobject_dict_id)
            .expect("xobject dict")
            .as_dict()
            .expect("xobject dict");
        assert!(
            xobjects.get(b"R11").is_ok(),
            "existing page XObject was lost"
        );
        assert!(
            xobjects.get(b"XfaAp0").is_ok(),
            "new flattened widget XObject was not added"
        );
    }
5076
    /// An encrypted PDF with no XFA should pass through flattening without
    /// error — there is nothing XFA-related to decrypt or process.
    #[test]
    fn encrypted_pdf_without_xfa_returns_ok() {
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        // Standard security handler entry; the contents are not actually
        // encrypted, only the trailer advertises encryption.
        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
            "Filter" => Object::Name(b"Standard".to_vec()),
            "V" => Object::Integer(2),
            "Length" => Object::Integer(128),
        }));
        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));

        let mut buf = Vec::new();
        doc.save_to(&mut buf).expect("save test PDF");

        let result = flatten_xfa_to_pdf(&buf);
        assert!(result.is_ok(), "non-XFA encrypted PDF should return Ok");
    }
5117
    #[test]
    fn encrypted_xfa_pdf_returns_encrypted_error() {
        // Minimal single-page PDF that *does* carry an XFA packet.
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        // Tiny XFA packet wired through /AcroForm -> /XFA.
        let xfa_stream_id = doc.add_object(Object::Stream(lopdf::Stream::new(
            dictionary! {},
            b"<xdp:xdp></xdp:xdp>".to_vec(),
        )));
        let acroform_id = doc.add_object(Object::Dictionary(dictionary! {
            "XFA" => Object::Reference(xfa_stream_id),
        }));
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
            "AcroForm" => Object::Reference(acroform_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        // Mark the file as encrypted via a trailer /Encrypt entry.
        let encrypt_id = doc.add_object(Object::Dictionary(dictionary! {
            "Filter" => Object::Name(b"Standard".to_vec()),
            "V" => Object::Integer(2),
            "Length" => Object::Integer(128),
        }));
        doc.trailer.set("Encrypt", Object::Reference(encrypt_id));

        let mut buf = Vec::new();
        doc.save_to(&mut buf).expect("save encrypted PDF");

        // Encrypted *and* containing XFA: flattening must fail with the
        // dedicated Encrypted error variant, not a generic one.
        let result = flatten_xfa_to_pdf(&buf);
        assert!(result.is_err(), "expected Encrypted error");
        let err = result.unwrap_err();
        assert!(
            matches!(err, XfaError::Encrypted(_)),
            "expected XfaError::Encrypted, got: {err:?}"
        );
    }
5173
    #[test]
    fn owner_only_encrypted_pdf_is_handled_transparently() {
        // PDF 2.0 document encrypted with AES-256 using only an owner
        // password (empty user password) — openable without credentials.
        let mut doc = Document::with_version("2.0");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ]),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        // Really encrypt the document: owner password "secret", empty user
        // password, default permissions.
        let state = lopdf::aes256_encryption_state("secret", "", lopdf::Permissions::default())
            .expect("create encryption state");
        doc.encrypt(&state).expect("encrypt document");

        let mut buf = Vec::new();
        doc.save_to(&mut buf).expect("save encrypted PDF");

        // Owner-only encryption should be transparent on load, so the
        // encryption probe must report "not encrypted".
        assert!(
            !is_pdf_encrypted(&buf),
            "lopdf should auto-decrypt owner-only PDFs"
        );

        // And flattening proceeds as if the file were plain.
        let result = flatten_xfa_to_pdf(&buf);
        assert!(
            result.is_ok(),
            "owner-only encrypted PDF should be handled, got: {result:?}"
        );
    }
5223
5224 fn build_pdf_with_cid_font(w_array: Vec<Object>, dw: Option<i64>) -> Document {
5226 let mut doc = Document::with_version("1.4");
5227
5228 let mut cid_dict = dictionary! {
5230 "Type" => Object::Name(b"Font".to_vec()),
5231 "Subtype" => Object::Name(b"CIDFontType2".to_vec()),
5232 "BaseFont" => Object::Name(b"TestFont".to_vec()),
5233 "W" => Object::Array(w_array)
5234 };
5235 if let Some(dw_val) = dw {
5236 cid_dict.set("DW", Object::Integer(dw_val));
5237 }
5238 let cid_id = doc.add_object(Object::Dictionary(cid_dict));
5239
5240 let type0_dict = dictionary! {
5242 "Type" => Object::Name(b"Font".to_vec()),
5243 "Subtype" => Object::Name(b"Type0".to_vec()),
5244 "BaseFont" => Object::Name(b"TestFont".to_vec()),
5245 "DescendantFonts" => Object::Array(vec![Object::Reference(cid_id)])
5246 };
5247 doc.add_object(Object::Dictionary(type0_dict));
5248 doc
5249 }
5250
5251 #[test]
5254 fn cid_w_array_consecutive() {
5255 let w = vec![
5256 Object::Integer(120),
5257 Object::Array(vec![
5258 Object::Integer(500),
5259 Object::Integer(600),
5260 Object::Integer(700),
5261 ]),
5262 ];
5263 let doc = build_pdf_with_cid_font(w, None);
5264 let _fonts = extract_embedded_fonts(&doc);
5265
5266 for obj in doc.objects.values() {
5269 let dict = match obj.as_dict() {
5270 Ok(d) => d,
5271 Err(_) => continue,
5272 };
5273 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5274 if subtype == Some(b"Type0".as_slice()) {
5275 let result = extract_cid_font_widths(&doc, dict);
5276 let (first, widths) = result.expect("should parse /W array");
5277 assert_eq!(first, 120);
5278 assert_eq!(widths.len(), 3);
5279 assert_eq!(widths[0], 500); assert_eq!(widths[1], 600); assert_eq!(widths[2], 700); return;
5283 }
5284 }
5285 panic!("Type0 font not found in test document");
5286 }
5287
5288 #[test]
5291 fn cid_w_array_range() {
5292 let w = vec![
5293 Object::Integer(200),
5294 Object::Integer(300),
5295 Object::Integer(250),
5296 ];
5297 let doc = build_pdf_with_cid_font(w, None);
5298
5299 for obj in doc.objects.values() {
5300 let dict = match obj.as_dict() {
5301 Ok(d) => d,
5302 Err(_) => continue,
5303 };
5304 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5305 if subtype == Some(b"Type0".as_slice()) {
5306 let (first, widths) =
5307 extract_cid_font_widths(&doc, dict).expect("should parse /W range");
5308 assert_eq!(first, 200);
5309 assert_eq!(widths.len(), 101); assert!(widths.iter().all(|&w| w == 250));
5311 return;
5312 }
5313 }
5314 panic!("Type0 font not found");
5315 }
5316
5317 #[test]
5322 fn cid_w_array_mixed() {
5323 let w = vec![
5324 Object::Integer(120),
5325 Object::Array(vec![
5326 Object::Integer(500),
5327 Object::Integer(600),
5328 Object::Integer(700),
5329 ]),
5330 Object::Integer(200),
5331 Object::Integer(300),
5332 Object::Integer(250),
5333 ];
5334 let doc = build_pdf_with_cid_font(w, Some(1000));
5335
5336 for obj in doc.objects.values() {
5337 let dict = match obj.as_dict() {
5338 Ok(d) => d,
5339 Err(_) => continue,
5340 };
5341 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5342 if subtype == Some(b"Type0".as_slice()) {
5343 let (first, widths) =
5344 extract_cid_font_widths(&doc, dict).expect("should parse mixed /W");
5345 assert_eq!(first, 120);
5346 assert_eq!(widths.len(), 181); assert_eq!(widths[0], 500); assert_eq!(widths[1], 600); assert_eq!(widths[2], 700); assert_eq!(widths[3], 1000); assert_eq!(widths[79], 1000); assert_eq!(widths[80], 250); assert_eq!(widths[180], 250); return;
5358 }
5359 }
5360 panic!("Type0 font not found");
5361 }
5362
5363 #[test]
5365 fn cid_w_array_default_width() {
5366 let w = vec![
5367 Object::Integer(10),
5368 Object::Array(vec![Object::Integer(400)]),
5369 Object::Integer(20),
5370 Object::Array(vec![Object::Integer(600)]),
5371 ];
5372 let doc = build_pdf_with_cid_font(w, None); for obj in doc.objects.values() {
5375 let dict = match obj.as_dict() {
5376 Ok(d) => d,
5377 Err(_) => continue,
5378 };
5379 let subtype = dict.get(b"Subtype").ok().and_then(|o| o.as_name().ok());
5380 if subtype == Some(b"Type0".as_slice()) {
5381 let (first, widths) = extract_cid_font_widths(&doc, dict).expect("should parse /W");
5382 assert_eq!(first, 10);
5383 assert_eq!(widths[0], 400); assert_eq!(widths[5], 1000); assert_eq!(widths[10], 600); return;
5387 }
5388 }
5389 panic!("Type0 font not found");
5390 }
5391
5392 #[test]
5393 fn extract_embedded_fonts_keeps_simple_pdf_fonts_without_fontfile() {
5394 let mut doc = Document::new();
5395 let font_id = doc.add_object(Object::Dictionary(dictionary! {
5396 "Type" => Object::Name(b"Font".to_vec()),
5397 "Subtype" => Object::Name(b"Type1".to_vec()),
5398 "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
5399 "FirstChar" => Object::Integer(32),
5400 "LastChar" => Object::Integer(34),
5401 "Widths" => Object::Array(vec![
5402 Object::Integer(278),
5403 Object::Integer(333),
5404 Object::Integer(612),
5405 ]),
5406 "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
5407 }));
5408
5409 let fonts = extract_embedded_fonts(&doc);
5410 let font = fonts
5411 .iter()
5412 .find(|font| font.name == "MyriadPro-Regular")
5413 .expect("expected reusable simple font");
5414
5415 assert!(font.data.is_empty(), "no FontFile* should keep data empty");
5416 assert_eq!(font.pdf_widths, Some((32, vec![278, 333, 612])));
5417 assert_eq!(
5418 font.pdf_source_font,
5419 Some(PdfSourceFont { object_id: font_id })
5420 );
5421 }
5422
5423 #[test]
5424 fn store_font_data_reserves_family_alias_for_regular_face() {
5425 let mut fonts = Vec::new();
5426 store_font_data(
5427 &mut fonts,
5428 "ArialMT",
5429 Vec::new(),
5430 Some((32, vec![278, 333, 611])),
5431 None,
5432 Some(PdfSourceFont { object_id: (1, 0) }),
5433 );
5434 store_font_data(
5435 &mut fonts,
5436 "Arial-BoldMT",
5437 Vec::new(),
5438 Some((32, vec![278, 333, 611])),
5439 None,
5440 Some(PdfSourceFont { object_id: (2, 0) }),
5441 );
5442 store_font_data(
5443 &mut fonts,
5444 "Arial-ItalicMT",
5445 Vec::new(),
5446 Some((32, vec![278, 333, 611])),
5447 None,
5448 Some(PdfSourceFont { object_id: (3, 0) }),
5449 );
5450
5451 let aliases: Vec<_> = fonts.iter().map(|font| font.name.as_str()).collect();
5452 assert!(aliases.contains(&"ArialMT"));
5453 assert!(aliases.contains(&"Arial-BoldMT"));
5454 assert!(aliases.contains(&"Arial-ItalicMT"));
5455 assert_eq!(
5456 aliases.iter().filter(|name| **name == "Arial").count(),
5457 1,
5458 "only the regular face should claim the bare family alias"
5459 );
5460 }
5461
5462 #[test]
5463 fn store_font_data_keeps_regular_ps_family_alias() {
5464 let mut fonts = Vec::new();
5465 store_font_data(
5466 &mut fonts,
5467 "MyriadPro-Regular",
5468 Vec::new(),
5469 Some((32, vec![278, 333, 612])),
5470 None,
5471 Some(PdfSourceFont { object_id: (4, 0) }),
5472 );
5473
5474 assert!(
5475 fonts.iter().any(|font| font.name == "Myriad Pro"),
5476 "regular PostScript names should still expose their family alias"
5477 );
5478 }
5479
5480 #[test]
5481 fn page_content_streams_resolves_indirect_contents_arrays() {
5482 let mut doc = Document::new();
5483 let stream_a = doc.add_object(Stream::new(
5484 dictionary! {"Length" => Object::Integer(8)},
5485 b"(A) Tj\n".to_vec(),
5486 ));
5487 let stream_b = doc.add_object(Stream::new(
5488 dictionary! {"Length" => Object::Integer(8)},
5489 b"(B) Tj\n".to_vec(),
5490 ));
5491 let contents_array = doc.add_object(Object::Array(vec![
5492 Object::Reference(stream_a),
5493 Object::Reference(stream_b),
5494 ]));
5495 let page_id = doc.add_object(Object::Dictionary(dictionary! {
5496 "Type" => Object::Name(b"Page".to_vec()),
5497 "Contents" => Object::Reference(contents_array),
5498 }));
5499
5500 let streams = page_content_streams(&doc, page_id);
5501
5502 assert_eq!(
5503 streams.len(),
5504 2,
5505 "indirect /Contents arrays must be traversed"
5506 );
5507 assert!(streams[0].windows(2).any(|w| w == b"Tj"));
5508 assert!(streams[1].windows(2).any(|w| w == b"Tj"));
5509 }
5510
    #[test]
    fn embed_resolved_fonts_reuses_existing_pdf_font_object() {
        // Source document already contains a simple WinAnsi-encoded Type1 font.
        let mut doc = Document::new();
        let source_font_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Font".to_vec()),
            "Subtype" => Object::Name(b"Type1".to_vec()),
            "BaseFont" => Object::Name(b"MyriadPro-Regular".to_vec()),
            "Encoding" => Object::Name(b"WinAnsiEncoding".to_vec()),
        }));
        let before = doc.objects.len();

        // A resolved font with no raw font program but a pdf_source_font
        // pointing at the existing object: embedding should reuse it
        // instead of creating a duplicate.
        let mut resolved = HashMap::new();
        resolved.insert(
            "Myriad Pro_Normal_Normal".to_string(),
            ResolvedFont {
                name: "Myriad Pro".to_string(),
                data: Vec::new(),
                face_index: 0,
                units_per_em: 1000,
                ascender: 800,
                descender: -200,
                pdf_widths: Some((32, vec![278, 333, 612])),
                pdf_encoding: None,
                pdf_source_font: Some(PdfSourceFont {
                    object_id: source_font_id,
                }),
            },
        );

        let empty_layout = LayoutDom { pages: vec![] };
        let (_font_map, font_objects, metrics_data) =
            embed_resolved_fonts(&mut doc, &resolved, &empty_layout);

        // Object count unchanged -> the existing font object was reused.
        assert_eq!(
            doc.objects.len(),
            before,
            "should not embed a new font object"
        );
        assert_eq!(font_objects.len(), 1);
        assert_eq!(font_objects[0].1, source_font_id);
        // An absent font_data signals downstream text rendering to keep the
        // source font's WinAnsi encoding rather than an embedded font's.
        assert!(
            metrics_data["Myriad Pro_Normal_Normal"].font_data.is_none(),
            "reused simple fonts must keep WinAnsi text encoding"
        );
    }
5556
5557 #[test]
5558 fn strip_undefined_entities_preserves_raw_ampersands_in_processing_instructions() {
5559 let xml = r##"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><?renderCache.textRun 24 A. Adjustment & Location 0 1417 14917 0 0 0 "Myriad Pro" 0 0 18000 ISO-8859-1?><?renderCache.subset "Arial" 0 0 ISO-8859-1 "#$%&'()+,-./" ?><subform name="form1"><field name="A"/></subform></template>"##;
5560
5561 let stripped = strip_undefined_xml_entities(xml);
5562
5563 assert_eq!(
5564 stripped, xml,
5565 "raw ampersands inside processing instructions are valid and must survive sanitization"
5566 );
5567 roxmltree::Document::parse(&stripped)
5568 .expect("processing instructions must remain parseable");
5569 }
5570
5571 #[test]
5572 fn strip_undefined_entities_drops_only_true_named_entity_references() {
5573 let xml = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/"><subform name="form1"><draw name="D"><value><text>alpha &bogus; beta © & gamma</text></value></draw></subform></template>"#;
5574
5575 let stripped = strip_undefined_xml_entities(xml);
5576
5577 assert!(
5578 !stripped.contains("&bogus;"),
5579 "unknown named entities should still be removed for roxmltree compatibility"
5580 );
5581 assert!(stripped.contains("©"));
5582 assert!(stripped.contains("&"));
5583 roxmltree::Document::parse(&stripped).expect("sanitized XML should parse");
5584 }
5585
    // Template declares one repeating `Row` subform with bind=none, so the
    // initial merge materializes only a single instance; the saved form DOM
    // then carries three concrete `Row` instances that must be expanded into
    // the tree together with their field values.
    #[test]
    fn form_dom_expands_repeating_subform_instances() {
        use xfa_layout_engine::form::FormNodeType;

        let template = r#"<template xmlns="http://www.xfa.org/schema/xfa-template/3.3/">
        <subform name="root" layout="tb">
        <pageSet><pageArea name="P1">
        <contentArea w="200mm" h="280mm"/>
        <medium short="210mm" long="297mm"/>
        </pageArea></pageSet>
        <subform name="body" layout="tb">
        <subform name="Items" layout="tb">
        <bind match="none"/>
        <subform name="Row" layout="tb">
        <bind match="none"/>
        <occur max="-1"/>
        <field name="Label"><ui><textEdit/></ui></field>
        </subform>
        </subform>
        </subform>
        </subform>
        </template>"#;

        let form_xml = r#"<form xmlns="http://www.xfa.org/schema/xfa-form/2.8/">
        <subform name="root">
        <subform name="body">
        <subform name="Items">
        <instanceManager name="_Row"/>
        <subform name="Row">
        <field name="Label"><value><text>Alpha</text></value></field>
        </subform>
        <subform name="Row">
        <field name="Label"><value><text>Beta</text></value></field>
        </subform>
        <subform name="Row">
        <field name="Label"><value><text>Gamma</text></value></field>
        </subform>
        </subform>
        </subform>
        </subform>
        </form>"#;

        // Merge the template against an empty data DOM: with bind=none the
        // repeating Row stays at its single template occurrence.
        let data_dom = xfa_dom_resolver::data_dom::DataDom::new();
        let merger = crate::merger::FormMerger::new(&data_dom);
        let (mut tree, root_id) = merger.merge(template).unwrap();

        // Depth-first search for the first descendant with the given name.
        fn find_by_name(tree: &FormTree, parent: FormNodeId, name: &str) -> Option<FormNodeId> {
            for &c in &tree.get(parent).children {
                if tree.get(c).name == name {
                    return Some(c);
                }
                if let Some(found) = find_by_name(tree, c, name) {
                    return Some(found);
                }
            }
            None
        }
        let items_id =
            find_by_name(&tree, root_id, "Items").expect("Items subform not found in tree");
        let rows_before = tree
            .get(items_id)
            .children
            .iter()
            .filter(|&&c| tree.get(c).name == "Row")
            .count();
        assert_eq!(
            rows_before, 1,
            "template merge should produce 1 Row (bind=none)"
        );

        // Apply the saved form DOM: it should clone Row to three instances.
        apply_form_dom_presence(&mut tree, root_id, form_xml);

        let rows_after: Vec<FormNodeId> = tree
            .get(items_id)
            .children
            .iter()
            .filter(|&&c| tree.get(c).name == "Row")
            .copied()
            .collect();
        assert_eq!(
            rows_after.len(),
            3,
            "form DOM should expand to 3 Row instances"
        );

        // The per-instance field values must be carried over in order.
        let values: Vec<String> = rows_after
            .iter()
            .map(|&row_id| {
                let label_id = tree.get(row_id).children[0];
                match &tree.get(label_id).node_type {
                    FormNodeType::Field { value } => value.clone(),
                    _ => String::new(),
                }
            })
            .collect();
        assert_eq!(values, vec!["Alpha", "Beta", "Gamma"]);
    }
5691
5692 #[test]
5698 fn flatten_xfa_to_pdf_recursion_guard_returns_error() {
5699 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5700 let result = flatten_xfa_to_pdf_simulate_reentrant(&pdf_bytes);
5701 assert!(
5702 result.is_err(),
5703 "expected recursion guard to return Err, got Ok"
5704 );
5705 let err_msg = result.unwrap_err().to_string();
5706 assert!(
5707 err_msg.contains("recursively"),
5708 "expected error message to mention recursion, got: {err_msg}"
5709 );
5710 }
5711
5712 #[test]
5715 fn flatten_xfa_to_pdf_depth_counter_resets_after_call() {
5716 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5717 let _ = flatten_xfa_to_pdf(&pdf_bytes);
5719 let pdf_bytes2 = build_xfa_pdf(SIMPLE_XDP);
5721 let result = flatten_xfa_to_pdf(&pdf_bytes2);
5722 assert!(
5723 result.is_ok(),
5724 "second flatten call should succeed, got: {result:?}"
5725 );
5726 }
5727
5728 #[test]
5732 fn flatten_xfa_to_pdf_does_not_panic_on_empty_input() {
5733 let result = flatten_xfa_to_pdf(&[]);
5734 let _ = result;
5738 }
5739
5740 #[test]
5748 fn flatten_pipeline_completes_on_minimal_xfa_pdf() {
5749 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5750 let result = flatten_xfa_to_pdf(&pdf_bytes);
5754 let _ = result; }
5756
5757 #[test]
5758 fn flatten_with_layout_dump_preserves_pdf_bytes() {
5759 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5760 let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("plain flatten should succeed");
5761 let (flattened_with_dump, layout_dump) =
5762 flatten_xfa_to_pdf_with_layout_dump(&pdf_bytes).expect("dump flatten should succeed");
5763
5764 assert_eq!(flattened_with_dump, flattened);
5765 assert!(!layout_dump.pages.is_empty());
5766 assert_eq!(layout_dump.pages[0].page_num, 1);
5767 assert!(layout_dump.pages[0].used_height <= layout_dump.pages[0].page_height);
5768 }
5769
    #[test]
    fn flatten_removes_needs_rendering() {
        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
        // Inject /NeedsRendering true into the catalog of the source PDF.
        {
            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for NeedsRendering test");
            let root_id = match doc.trailer.get(b"Root") {
                Ok(Object::Reference(id)) => *id,
                _ => panic!("no Root in trailer"),
            };
            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(root_id) {
                dict.set("NeedsRendering", Object::Boolean(true));
            }
            let mut out = Vec::new();
            doc.save_to(&mut out)
                .expect("re-save for NeedsRendering test");
            pdf_bytes = out;
        }

        // After flattening, the dynamic-rendering flag must be gone from
        // the catalog.
        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
        let root_id = match doc.trailer.get(b"Root") {
            Ok(Object::Reference(id)) => *id,
            _ => panic!("no Root in flattened trailer"),
        };
        let catalog = doc.get_dictionary(root_id).expect("catalog dict");
        assert!(
            catalog.get(b"NeedsRendering").is_err(),
            "/NeedsRendering must be absent after flatten"
        );
    }
5808
5809 #[test]
5812 fn flatten_removes_xfa_entry() {
5813 let pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
5814 let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
5815
5816 let flattened_str = String::from_utf8_lossy(&flattened);
5819 assert!(
5820 !flattened_str.contains("/XFA"),
5821 "/XFA must be absent from flattened output, but was found"
5822 );
5823 }
5824
5825 #[test]
5826 fn remove_acroform_purges_xfa_packet_objects() {
5827 let (mut doc, acroform_id, xfa_ids) = build_xfa_doc_with_xfa_array();
5828
5829 remove_acroform(&mut doc);
5830
5831 assert!(
5832 !doc.objects.contains_key(&acroform_id),
5833 "AcroForm object should be removed from doc.objects"
5834 );
5835 for xfa_id in &xfa_ids {
5836 assert!(
5837 !doc.objects.contains_key(xfa_id),
5838 "XFA packet object {xfa_id:?} should be removed from doc.objects"
5839 );
5840 }
5841
5842 let mut out = Vec::new();
5843 doc.save_to(&mut out).expect("save cleaned PDF");
5844 let out_str = String::from_utf8_lossy(&out);
5845 assert!(
5846 !out_str.contains("xdp:xdp"),
5847 "serialized output should not contain orphaned XFA packet payloads"
5848 );
5849 assert!(
5850 !out_str.contains("<template"),
5851 "serialized output should not contain orphaned template payloads"
5852 );
5853 }
5854
    #[test]
    fn flatten_removes_empty_annots_arrays() {
        // Start from a normal XFA PDF, then force an empty /Annots array
        // onto the first page before flattening.
        let mut pdf_bytes = build_xfa_pdf(SIMPLE_XDP);
        {
            let mut doc = Document::load_mem(&pdf_bytes).expect("parse for annots test");
            let page_id = doc.page_iter().next().expect("at least one page");
            if let Ok(Object::Dictionary(ref mut dict)) = doc.get_object_mut(page_id) {
                dict.set("Annots", Object::Array(vec![]));
            }
            let mut out = Vec::new();
            doc.save_to(&mut out).expect("re-save for annots test");
            pdf_bytes = out;
        }

        let flattened = flatten_xfa_to_pdf(&pdf_bytes).expect("flatten failed");
        let doc = Document::load_mem(&flattened).expect("parse flattened PDF");
        for page_id in doc.page_iter() {
            let page = doc.get_dictionary(page_id).expect("page dict");
            match page.get(b"Annots") {
                Ok(Object::Array(arr)) => {
                    assert!(
                        !arr.is_empty(),
                        "page {:?}: /Annots must either be absent or non-empty after flatten",
                        page_id
                    );
                }
                // Absent /Annots (or a non-array value) is acceptable.
                _ => {}
            }
        }
    }
5887
    #[test]
    fn remove_acroform_strips_widgets_from_indirect_annots_arrays() {
        // Widget annotation with a minimal form-XObject appearance stream.
        let appearance = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(20), Object::Integer(20),
                ]),
                "Resources" => Object::Dictionary(dictionary! {}),
            },
            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
        ));
        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
            Vec::new(),
            appearance,
            dictionary! {
                "FT" => Object::Name(b"Tx".to_vec()),
                "T" => Object::string_literal("field[0]"),
            },
        );

        // Rewire the page's /Annots from a direct array to an *indirect*
        // reference — the cleanup must resolve the reference.
        let mut doc = Document::load_mem(&pdf_bytes).expect("parse test PDF");
        let page_id = doc.page_iter().next().expect("page");
        let annots = page_annotations(&doc, page_id);
        let annots_id = doc.add_object(Object::Array(annots));
        if let Ok(Object::Dictionary(ref mut page_dict)) = doc.get_object_mut(page_id) {
            page_dict.set("Annots", Object::Reference(annots_id));
        }

        remove_acroform(&mut doc);

        // With only widget annotations present, the /Annots key itself
        // should be dropped from the page.
        let page = doc.get_dictionary(page_id).expect("page dict");
        assert!(
            page.get(b"Annots").is_err(),
            "widget-only indirect /Annots must be removed"
        );
    }
5927
    #[test]
    fn acroform_without_xfa_falls_back_to_static_cleanup() {
        // Build an XFA PDF carrying one widget annotation, then strip the
        // /XFA entry so only a classic AcroForm remains.
        let appearance = Object::Stream(Stream::new(
            dictionary! {
                "Type" => Object::Name(b"XObject".to_vec()),
                "Subtype" => Object::Name(b"Form".to_vec()),
                "BBox" => Object::Array(vec![
                    Object::Integer(0), Object::Integer(0),
                    Object::Integer(20), Object::Integer(20),
                ]),
                "Resources" => Object::Dictionary(dictionary! {}),
            },
            b"BT /F1 8 Tf 1 1 Td (X) Tj ET\n".to_vec(),
        ));
        let pdf_bytes = build_xfa_pdf_with_widget_appearance(
            Vec::new(),
            appearance,
            dictionary! {
                "FT" => Object::Name(b"Tx".to_vec()),
                "T" => Object::string_literal("field[0]"),
            },
        );

        let mut doc = Document::load_mem(&pdf_bytes).expect("parse source PDF");
        let root_id = match doc.trailer.get(b"Root") {
            Ok(Object::Reference(id)) => *id,
            _ => panic!("no Root"),
        };
        let acroform_id = doc
            .get_dictionary(root_id)
            .expect("catalog")
            .get(b"AcroForm")
            .expect("AcroForm")
            .as_reference()
            .expect("AcroForm ref");
        // Delete /XFA so this becomes a plain AcroForm document.
        if let Ok(Object::Dictionary(ref mut acroform)) = doc.get_object_mut(acroform_id) {
            acroform.remove(b"XFA");
        }
        let mut acroform_only = Vec::new();
        doc.save_to(&mut acroform_only)
            .expect("save AcroForm-only PDF");

        // Flatten must still clean the document: no /AcroForm left...
        let flattened = flatten_xfa_to_pdf(&acroform_only).expect("flatten failed");
        let flattened_doc = Document::load_mem(&flattened).expect("parse flattened PDF");
        let root_id = match flattened_doc.trailer.get(b"Root") {
            Ok(Object::Reference(id)) => *id,
            _ => panic!("no Root in flattened PDF"),
        };
        let catalog = flattened_doc
            .get_dictionary(root_id)
            .expect("flattened catalog");
        assert!(
            catalog.get(b"AcroForm").is_err(),
            "AcroForm-only PDFs should still be cleaned by flatten"
        );

        // ...and no widget annotations remaining on the page.
        let page_id = flattened_doc.page_iter().next().expect("flattened page");
        assert!(
            page_annotations(&flattened_doc, page_id).is_empty(),
            "flattened AcroForm-only PDFs should not retain widget annotations"
        );
    }
5990
    #[test]
    fn validate_flattened_pdf_clean_pdf_passes() {
        // Hand-built single-page PDF with no AcroForm, no XFA, and no
        // NeedsRendering — i.e. the ideal post-flatten shape.
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ])
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        let mut pdf_bytes = Vec::new();
        doc.save_to(&mut pdf_bytes).expect("save clean PDF");

        // A clean PDF must pass every check and raise no warnings.
        let validation = validate_flattened_pdf(&pdf_bytes).expect("validate failed");
        assert!(
            validation.has_no_acroform,
            "clean PDF should have no AcroForm"
        );
        assert!(validation.has_no_xfa, "clean PDF should have no XFA");
        assert!(
            validation.has_no_needs_rendering,
            "clean PDF should have no NeedsRendering"
        );
        assert_eq!(validation.page_count, 1, "clean PDF should report 1 page");
        assert!(
            validation.warnings.is_empty(),
            "clean PDF should produce no warnings, got: {:?}",
            validation.warnings
        );
    }
6042
6043 #[test]
6045 fn validate_flattened_pdf_does_not_panic_on_empty_input() {
6046 let result = validate_flattened_pdf(&[]);
6047 assert!(
6049 result.is_ok(),
6050 "expected Ok from empty input, got: {:?}",
6051 result.err()
6052 );
6053 let v = result.unwrap();
6054 assert_eq!(v.page_count, 0, "empty input has 0 pages");
6055 assert!(
6056 !v.warnings.is_empty(),
6057 "empty input should produce at least one warning"
6058 );
6059 }
6060
6061 #[test]
6067 fn compare_flatten_quality_page_count_comparison() {
6068 let original = build_xfa_pdf(SIMPLE_XDP);
6069 let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6070 let metrics =
6071 compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6072 assert!(
6074 metrics.page_count_before >= 1,
6075 "original must have >= 1 page"
6076 );
6077 assert!(
6078 metrics.page_count_after >= 1,
6079 "flattened must have >= 1 page"
6080 );
6081 assert_eq!(
6083 metrics.page_count_match,
6084 metrics.page_count_before == metrics.page_count_after,
6085 "page_count_match must equal page_count_before == page_count_after"
6086 );
6087 }
6088
6089 #[test]
6091 fn compare_flatten_quality_content_ratio_computed() {
6092 let original = build_xfa_pdf(SIMPLE_XDP);
6093 let flattened = flatten_xfa_to_pdf(&original).expect("flatten failed");
6094 let metrics =
6095 compare_flatten_quality(&original, &flattened).expect("compare_flatten_quality failed");
6096 assert!(
6098 metrics.content_ratio.is_finite() && metrics.content_ratio >= 0.0,
6099 "content_ratio must be finite and >= 0, got: {}",
6100 metrics.content_ratio
6101 );
6102 let expected = if metrics.content_stream_bytes_before == 0 {
6104 1.0_f64
6105 } else {
6106 metrics.content_stream_bytes_after as f64 / metrics.content_stream_bytes_before as f64
6107 };
6108 assert!(
6109 (metrics.content_ratio - expected).abs() < 1e-9,
6110 "content_ratio mismatch: expected {expected}, got {}",
6111 metrics.content_ratio
6112 );
6113 }
6114
    #[test]
    fn validate_text_completeness_no_datasets_returns_perfect_ratio() {
        // XDP with a template but no <datasets> packet: there are no data
        // values whose presence in the output could be checked.
        let xdp = r#"<?xml version="1.0"?>
<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">
  <template>
    <subform name="root">
      <field name="greeting"><ui><textEdit/></ui></field>
    </subform>
  </template>
</xdp:xdp>"#;
        let original = build_xfa_pdf(xdp);
        // Unrelated clean PDF standing in for the "flattened" output.
        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0), Object::Integer(0),
                Object::Integer(612), Object::Integer(792),
            ])
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1)
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id)
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        let mut flattened = Vec::new();
        doc.save_to(&mut flattened).unwrap();

        // Nothing expected -> vacuously complete, ratio pinned to 1.0.
        let result = validate_text_completeness(&original, &flattened)
            .expect("validate_text_completeness should not fail");
        assert!(
            result.expected_values.is_empty(),
            "no datasets packet means no expected values"
        );
        assert_eq!(
            result.completeness_ratio, 1.0,
            "empty expected set should yield ratio 1.0"
        );
    }
6171
6172 #[test]
6174 fn validate_text_completeness_empty_inputs_do_not_panic() {
6175 let result = validate_text_completeness(&[], &[]);
6176 assert!(result.is_ok(), "should return Ok on empty inputs");
6177 let v = result.unwrap();
6178 assert_eq!(v.completeness_ratio, 1.0);
6179 assert!(v.expected_values.is_empty());
6180 assert!(v.missing_values.is_empty());
6181 }
6182
    #[test]
    fn flatten_empty_bytes_does_not_panic_and_does_not_error() {
        // Smoke test: zero-length input must never panic.
        // NOTE(review): the test name promises "does not error", yet the
        // match below also tolerates Err — consider tightening to
        // `assert!(result.is_ok())` if empty input is guaranteed to succeed.
        let result = flatten_xfa_to_pdf(b"");
        match result {
            Ok(_) => {}
            Err(_) => {}
        }
    }
6203
    #[test]
    fn flatten_non_xfa_bytes_returns_input_unchanged() {
        // Minimal non-XFA PDF shell: when flattening succeeds it must be a
        // byte-for-byte pass-through.
        let input = b"%PDF-1.4\n%%EOF\n";
        let result = flatten_xfa_to_pdf(input);
        match result {
            Ok(out) => assert_eq!(out, input, "non-XFA input should pass through unchanged"),
            // NOTE(review): Err is silently tolerated here — presumably
            // because this stub may not parse as a full PDF; confirm and
            // document which outcome is actually expected.
            Err(_) => {}
        }
    }
6218}